From 453e0e3f0ef3202386b553719f628cef93ff95a7 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Wed, 17 Jun 2020 11:05:42 -0400 Subject: [PATCH 001/334] Minor code cleanup for perform_base_backup(). Merge two calls to sendDir() that are exactly the same except for the fifth argument. Adjust comments to match. Also, don't bother checking whether tblspc_map_file is NULL. We initialize it in all cases, so it can't be. Patch by me, reviewed by Amit Kapila and Kyotaro Horiguchi. Discussion: http://postgr.es/m/CA+TgmoYq+59SJ2zBbP891ngWPA9fymOqntqYcweSDYXS2a620A@mail.gmail.com --- src/backend/replication/basebackup.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c index 62633e7ddcd5..efcf1e6eb56a 100644 --- a/src/backend/replication/basebackup.c +++ b/src/backend/replication/basebackup.c @@ -269,7 +269,7 @@ perform_base_backup(basebackup_options *opt) XLogRecPtr endptr; TimeLineID endtli; StringInfo labelfile; - StringInfo tblspc_map_file = NULL; + StringInfo tblspc_map_file; backup_manifest_info manifest; int datadirpathlen; List *tablespaces = NIL; @@ -424,25 +424,23 @@ perform_base_backup(basebackup_options *opt) if (ti->path == NULL) { struct stat statbuf; + bool sendtblspclinks = true; /* In the main tar, include the backup_label first... */ sendFileWithContent(BACKUP_LABEL_FILE, labelfile->data, &manifest); - /* - * Send tablespace_map file if required and then the bulk of - * the files. - */ - if (tblspc_map_file && opt->sendtblspcmapfile) + /* Then the tablespace_map file, if required... */ + if (opt->sendtblspcmapfile) { sendFileWithContent(TABLESPACE_MAP, tblspc_map_file->data, &manifest); - sendDir(".", 1, false, tablespaces, false, - &manifest, NULL); + sendtblspclinks = false; } - else - sendDir(".", 1, false, tablespaces, true, - &manifest, NULL); + + /* Then the bulk of the files... 
*/ + sendDir(".", 1, false, tablespaces, sendtblspclinks, + &manifest, NULL); /* ... and pg_control after everything else. */ if (lstat(XLOG_CONTROL_FILE, &statbuf) != 0) From 2fd2effc50824a8775a088435a13f47b7a6f3b94 Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Wed, 17 Jun 2020 11:39:17 -0400 Subject: [PATCH 002/334] Improve server code to read files as part of a base backup. Don't use fread(), since that doesn't necessarily set errno. We could use read() instead, but it's even better to use pg_pread(), which allows us to avoid some extra calls to seek to the desired location in the file. Also, advertise a wait event while reading from a file, as we do for most other places where we're reading data from files. Patch by me, reviewed by Hamid Akhtar. Discussion: http://postgr.es/m/CA+TgmobBw-3573vMosGj06r72ajHsYeKtksT_oTxH8XvTL7DxA@mail.gmail.com --- doc/src/sgml/monitoring.sgml | 4 + src/backend/postmaster/pgstat.c | 3 + src/backend/replication/basebackup.c | 143 ++++++++++++++------------- src/include/pgstat.h | 3 +- 4 files changed, 86 insertions(+), 67 deletions(-) diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 89662cc0a367..dfa9d0d6410c 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -1193,6 +1193,10 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser + + BaseBackupRead + Waiting for base backup to read from a file. + BufFileRead Waiting for a read from a buffered file. 
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index e96134dac8aa..c022597bc09a 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -3931,6 +3931,9 @@ pgstat_get_wait_io(WaitEventIO w) switch (w) { + case WAIT_EVENT_BASEBACKUP_READ: + event_name = "BaseBackupRead"; + break; case WAIT_EVENT_BUFFILE_READ: event_name = "BufFileRead"; break; diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c index efcf1e6eb56a..096b0fcef0d1 100644 --- a/src/backend/replication/basebackup.c +++ b/src/backend/replication/basebackup.c @@ -81,6 +81,8 @@ static int compareWalFileNames(const ListCell *a, const ListCell *b); static void throttle(size_t increment); static void update_basebackup_progress(int64 delta); static bool is_checksummed_file(const char *fullpath, const char *filename); +static int basebackup_read_file(int fd, char *buf, size_t nbytes, off_t offset, + const char *filename, bool partial_read_ok); /* Was the backup currently in-progress initiated in recovery mode? */ static bool backup_started_in_recovery = false; @@ -98,18 +100,6 @@ static char *statrelpath = NULL; */ #define THROTTLING_FREQUENCY 8 -/* - * Checks whether we encountered any error in fread(). fread() doesn't give - * any clue what has happened, so we check with ferror(). Also, neither - * fread() nor ferror() set errno, so we just throw a generic error. - */ -#define CHECK_FREAD_ERROR(fp, filename) \ -do { \ - if (ferror(fp)) \ - ereport(ERROR, \ - (errmsg("could not read from file \"%s\"", filename))); \ -} while (0) - /* The actual number of bytes, transfer of which may cause sleep. 
*/ static uint64 throttling_sample; @@ -600,7 +590,7 @@ perform_base_backup(basebackup_options *opt) foreach(lc, walFileList) { char *walFileName = (char *) lfirst(lc); - FILE *fp; + int fd; char buf[TAR_SEND_SIZE]; size_t cnt; pgoff_t len = 0; @@ -608,8 +598,8 @@ perform_base_backup(basebackup_options *opt) snprintf(pathbuf, MAXPGPATH, XLOGDIR "/%s", walFileName); XLogFromFileName(walFileName, &tli, &segno, wal_segment_size); - fp = AllocateFile(pathbuf, "rb"); - if (fp == NULL) + fd = OpenTransientFile(pathbuf, O_RDONLY | PG_BINARY); + if (fd < 0) { int save_errno = errno; @@ -626,7 +616,7 @@ perform_base_backup(basebackup_options *opt) errmsg("could not open file \"%s\": %m", pathbuf))); } - if (fstat(fileno(fp), &statbuf) != 0) + if (fstat(fd, &statbuf) != 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", @@ -642,9 +632,10 @@ perform_base_backup(basebackup_options *opt) /* send the WAL file itself */ _tarWriteHeader(pathbuf, NULL, &statbuf, false); - while ((cnt = fread(buf, 1, - Min(sizeof(buf), wal_segment_size - len), - fp)) > 0) + while ((cnt = basebackup_read_file(fd, buf, + Min(sizeof(buf), + wal_segment_size - len), + len, pathbuf, true)) > 0) { CheckXLogRemoved(segno, tli); /* Send the chunk as a CopyData message */ @@ -660,8 +651,6 @@ perform_base_backup(basebackup_options *opt) break; } - CHECK_FREAD_ERROR(fp, pathbuf); - if (len != wal_segment_size) { CheckXLogRemoved(segno, tli); @@ -676,7 +665,7 @@ perform_base_backup(basebackup_options *opt) */ Assert(wal_segment_size % TAR_BLOCK_SIZE == 0); - FreeFile(fp); + CloseTransientFile(fd); /* * Mark file as archived, otherwise files can get archived again @@ -1575,7 +1564,7 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf, bool missing_ok, Oid dboid, backup_manifest_info *manifest, const char *spcoid) { - FILE *fp; + int fd; BlockNumber blkno = 0; bool block_retry = false; char buf[TAR_SEND_SIZE]; @@ -1594,8 +1583,8 @@ sendFile(const 
char *readfilename, const char *tarfilename, pg_checksum_init(&checksum_ctx, manifest->checksum_type); - fp = AllocateFile(readfilename, "rb"); - if (fp == NULL) + fd = OpenTransientFile(readfilename, O_RDONLY | PG_BINARY); + if (fd < 0) { if (errno == ENOENT && missing_ok) return false; @@ -1637,8 +1626,27 @@ sendFile(const char *readfilename, const char *tarfilename, } } - while ((cnt = fread(buf, 1, Min(sizeof(buf), statbuf->st_size - len), fp)) > 0) + /* + * Loop until we read the amount of data the caller told us to expect. The + * file could be longer, if it was extended while we were sending it, but + * for a base backup we can ignore such extended data. It will be restored + * from WAL. + */ + while (len < statbuf->st_size) { + /* Try to read some more data. */ + cnt = basebackup_read_file(fd, buf, + Min(sizeof(buf), statbuf->st_size - len), + len, readfilename, true); + + /* + * If we hit end-of-file, a concurrent truncation must have occurred. + * That's not an error condition, because WAL replay will fix things + * up. + */ + if (cnt == 0) + break; + /* * The checksums are verified at block level, so we iterate over the * buffer in chunks of BLCKSZ, after making sure that @@ -1689,16 +1697,15 @@ sendFile(const char *readfilename, const char *tarfilename, */ if (block_retry == false) { - /* Reread the failed block */ - if (fseek(fp, -(cnt - BLCKSZ * i), SEEK_CUR) == -1) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not fseek in file \"%s\": %m", - readfilename))); - } + int reread_cnt; - if (fread(buf + BLCKSZ * i, 1, BLCKSZ, fp) != BLCKSZ) + /* Reread the failed block */ + reread_cnt = + basebackup_read_file(fd, buf + BLCKSZ * i, + BLCKSZ, len + BLCKSZ * i, + readfilename, + false); + if (reread_cnt == 0) { /* * If we hit end-of-file, a concurrent @@ -1708,24 +1715,8 @@ sendFile(const char *readfilename, const char *tarfilename, * code that handles that case. (We must fix * up cnt first, though.) 
*/ - if (feof(fp)) - { - cnt = BLCKSZ * i; - break; - } - - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not reread block %d of file \"%s\": %m", - blkno, readfilename))); - } - - if (fseek(fp, cnt - BLCKSZ * i - BLCKSZ, SEEK_CUR) == -1) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not fseek in file \"%s\": %m", - readfilename))); + cnt = BLCKSZ * i; + break; } /* Set flag so we know a retry was attempted */ @@ -1768,20 +1759,8 @@ sendFile(const char *readfilename, const char *tarfilename, len += cnt; throttle(cnt); - - if (feof(fp) || len >= statbuf->st_size) - { - /* - * Reached end of file. The file could be longer, if it was - * extended while we were sending it, but for a base backup we can - * ignore such extended data. It will be restored from WAL. - */ - break; - } } - CHECK_FREAD_ERROR(fp, readfilename); - /* If the file was truncated while we were sending it, pad it with zeros */ if (len < statbuf->st_size) { @@ -1810,7 +1789,7 @@ sendFile(const char *readfilename, const char *tarfilename, update_basebackup_progress(pad); } - FreeFile(fp); + CloseTransientFile(fd); if (checksum_failures > 1) { @@ -1996,3 +1975,35 @@ update_basebackup_progress(int64 delta) pgstat_progress_update_multi_param(nparam, index, val); } + +/* + * Read some data from a file, setting a wait event and reporting any error + * encountered. + * + * If partial_read_ok is false, also report an error if the number of bytes + * read is not equal to the number of bytes requested. + * + * Returns the number of bytes read. 
+ */ +static int +basebackup_read_file(int fd, char *buf, size_t nbytes, off_t offset, + const char *filename, bool partial_read_ok) +{ + int rc; + + pgstat_report_wait_start(WAIT_EVENT_BASEBACKUP_READ); + rc = pg_pread(fd, buf, nbytes, offset); + pgstat_report_wait_end(); + + if (rc < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", filename))); + if (!partial_read_ok && rc > 0 && rc != nbytes) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": read %d of %zu", + filename, rc, nbytes))); + + return rc; +} diff --git a/src/include/pgstat.h b/src/include/pgstat.h index c55dc1481ca5..13872013823e 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -913,7 +913,8 @@ typedef enum */ typedef enum { - WAIT_EVENT_BUFFILE_READ = PG_WAIT_IO, + WAIT_EVENT_BASEBACKUP_READ = PG_WAIT_IO, + WAIT_EVENT_BUFFILE_READ, WAIT_EVENT_BUFFILE_WRITE, WAIT_EVENT_CONTROL_FILE_READ, WAIT_EVENT_CONTROL_FILE_SYNC, From fd49d53807575e009f7b66771d48c9356344d7d1 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Mon, 15 Jun 2020 18:23:10 -0700 Subject: [PATCH 003/334] Avoid potential spinlock in a signal handler as part of global barriers. On platforms without support for 64bit atomic operations where we also cannot rely on 64bit reads to have single copy atomicity, such atomics are implemented using a spinlock based fallback. That means it's not safe to even read such atomics from within a signal handler (since the signal handler might run when the spinlock already is held). To avoid this issue defer global barrier processing out of the signal handler. Instead of checking local / shared barrier generation to determine whether to set ProcSignalBarrierPending, introduce PROCSIGNAL_BARRIER and always set ProcSignalBarrierPending when receiving such a signal. Additionally avoid redundant work in ProcessProcSignalBarrier if ProcSignalBarrierPending is unnecessarily. Also do a small amount of other polishing. 
Author: Andres Freund Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20200609193723.eu5ilsjxwdpyxhgz@alap3.anarazel.de Backpatch: 13-, where the code was introduced. --- src/backend/storage/ipc/procsignal.c | 87 ++++++++++++++++------------ src/include/storage/procsignal.h | 1 + 2 files changed, 52 insertions(+), 36 deletions(-) diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index c809196d06a4..4fa385b0ece4 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -320,7 +320,7 @@ SendProcSignal(pid_t pid, ProcSignalReason reason, BackendId backendId) uint64 EmitProcSignalBarrier(ProcSignalBarrierType type) { - uint64 flagbit = UINT64CONST(1) << (uint64) type; + uint32 flagbit = 1 << (uint32) type; uint64 generation; /* @@ -363,7 +363,11 @@ EmitProcSignalBarrier(ProcSignalBarrierType type) pid_t pid = slot->pss_pid; if (pid != 0) + { + /* see SendProcSignal for details */ + slot->pss_signalFlags[PROCSIG_BARRIER] = true; kill(pid, SIGUSR1); + } } return generation; @@ -383,6 +387,8 @@ WaitForProcSignalBarrier(uint64 generation) { long timeout = 125L; + Assert(generation <= pg_atomic_read_u64(&ProcSignal->psh_barrierGeneration)); + for (int i = NumProcSignalSlots - 1; i >= 0; i--) { volatile ProcSignalSlot *slot = &ProcSignal->psh_slot[i]; @@ -417,6 +423,23 @@ WaitForProcSignalBarrier(uint64 generation) pg_memory_barrier(); } +/* + * Handle receipt of an interrupt indicating a global barrier event. + * + * All the actual work is deferred to ProcessProcSignalBarrier(), because we + * cannot safely access the barrier generation inside the signal handler as + * 64bit atomics might use spinlock based emulation, even for reads. As this + * routine only gets called when PROCSIG_BARRIER is sent that won't cause a + * lot fo unnecessary work. 
+ */ +static void +HandleProcSignalBarrierInterrupt(void) +{ + InterruptPending = true; + ProcSignalBarrierPending = true; + /* latch will be set by procsignal_sigusr1_handler */ +} + /* * Perform global barrier related interrupt checking. * @@ -428,22 +451,38 @@ WaitForProcSignalBarrier(uint64 generation) void ProcessProcSignalBarrier(void) { - uint64 generation; + uint64 local_gen; + uint64 shared_gen; uint32 flags; + Assert(MyProcSignalSlot); + /* Exit quickly if there's no work to do. */ if (!ProcSignalBarrierPending) return; ProcSignalBarrierPending = false; /* - * Read the current barrier generation, and then get the flags that are - * set for this backend. Note that pg_atomic_exchange_u32 is a full - * barrier, so we're guaranteed that the read of the barrier generation - * happens before we atomically extract the flags, and that any subsequent - * state changes happen afterward. + * It's not unlikely to process multiple barriers at once, before the + * signals for all the barriers have arrived. To avoid unnecessary work in + * response to subsequent signals, exit early if we already have processed + * all of them. + */ + local_gen = pg_atomic_read_u64(&MyProcSignalSlot->pss_barrierGeneration); + shared_gen = pg_atomic_read_u64(&ProcSignal->psh_barrierGeneration); + + Assert(local_gen <= shared_gen); + + if (local_gen == shared_gen) + return; + + /* + * Get and clear the flags that are set for this backend. Note that + * pg_atomic_exchange_u32 is a full barrier, so we're guaranteed that the + * read of the barrier generation above happens before we atomically + * extract the flags, and that any subsequent state changes happen + * afterward. */ - generation = pg_atomic_read_u64(&ProcSignal->psh_barrierGeneration); flags = pg_atomic_exchange_u32(&MyProcSignalSlot->pss_barrierCheckMask, 0); /* @@ -466,7 +505,7 @@ ProcessProcSignalBarrier(void) * things have changed further, it'll get fixed up when this function is * next called. 
*/ - pg_atomic_write_u64(&MyProcSignalSlot->pss_barrierGeneration, generation); + pg_atomic_write_u64(&MyProcSignalSlot->pss_barrierGeneration, shared_gen); } static void @@ -505,27 +544,6 @@ CheckProcSignal(ProcSignalReason reason) return false; } -/* - * CheckProcSignalBarrier - check for new barriers we need to absorb - */ -static bool -CheckProcSignalBarrier(void) -{ - volatile ProcSignalSlot *slot = MyProcSignalSlot; - - if (slot != NULL) - { - uint64 mygen; - uint64 curgen; - - mygen = pg_atomic_read_u64(&slot->pss_barrierGeneration); - curgen = pg_atomic_read_u64(&ProcSignal->psh_barrierGeneration); - return (mygen != curgen); - } - - return false; -} - /* * procsignal_sigusr1_handler - handle SIGUSR1 signal. */ @@ -546,6 +564,9 @@ procsignal_sigusr1_handler(SIGNAL_ARGS) if (CheckProcSignal(PROCSIG_WALSND_INIT_STOPPING)) HandleWalSndInitStopping(); + if (CheckProcSignal(PROCSIG_BARRIER)) + HandleProcSignalBarrierInterrupt(); + if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_DATABASE)) RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_DATABASE); @@ -564,12 +585,6 @@ procsignal_sigusr1_handler(SIGNAL_ARGS) if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN)) RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN); - if (CheckProcSignalBarrier()) - { - InterruptPending = true; - ProcSignalBarrierPending = true; - } - SetLatch(MyLatch); latch_sigusr1_handler(); diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h index a0c0bc3ce553..5cb39697f38f 100644 --- a/src/include/storage/procsignal.h +++ b/src/include/storage/procsignal.h @@ -33,6 +33,7 @@ typedef enum PROCSIG_NOTIFY_INTERRUPT, /* listen/notify interrupt */ PROCSIG_PARALLEL_MESSAGE, /* message from cooperating parallel backend */ PROCSIG_WALSND_INIT_STOPPING, /* ask walsenders to prepare for shutdown */ + PROCSIG_BARRIER, /* global barrier interrupt */ /* Recovery conflict reasons */ PROCSIG_RECOVERY_CONFLICT_DATABASE, From 4d4ca24efe8ebda9547337f47dcb61d3163be765 
Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Mon, 8 Jun 2020 15:25:49 -0700 Subject: [PATCH 004/334] spinlock emulation: Fix bug when more than INT_MAX spinlocks are initialized. Once the counter goes negative we ended up with spinlocks that errored out on first use (due to check in tas_sema). Author: Andres Freund Reviewed-By: Robert Haas Discussion: https://postgr.es/m/20200606023103.avzrctgv7476xj7i@alap3.anarazel.de Backpatch: 9.5- --- src/backend/storage/lmgr/spin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/storage/lmgr/spin.c b/src/backend/storage/lmgr/spin.c index 4d2a4c6641aa..753943e46d62 100644 --- a/src/backend/storage/lmgr/spin.c +++ b/src/backend/storage/lmgr/spin.c @@ -106,7 +106,7 @@ SpinlockSemaInit(void) void s_init_lock_sema(volatile slock_t *lock, bool nested) { - static int counter = 0; + static uint32 counter = 0; *lock = ((++counter) % NUM_SPINLOCK_SEMAPHORES) + 1; } From 6924c37f772cd7701d3e1267a1fb3221ca159ba4 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Wed, 17 Jun 2020 15:23:55 -0700 Subject: [PATCH 005/334] Fix nbtree.h dedup state comment. Oversight in commit 0d861bbb. --- src/include/access/nbtree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 4e2b056b5456..3b2bcb22a70e 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -758,7 +758,7 @@ typedef struct BTDedupStateData * will not become posting list tuples do not appear in the array (they * are implicitly unchanged by deduplication pass). 
*/ - int nintervals; /* current size of intervals array */ + int nintervals; /* current number of intervals in array */ BTDedupInterval intervals[MaxIndexTuplesPerPage]; } BTDedupStateData; From d8b15eeb8a1acbe01b502ddd3390d7f1824c7a25 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 17 Jun 2020 18:29:29 -0400 Subject: [PATCH 006/334] Sync our copy of the timezone library with IANA release tzcode2020a. This absorbs a leap-second-related bug fix in localtime.c, and teaches zic to handle an expiration marker in the leapseconds file. Neither are of any interest to us (for the foreseeable future anyway), but we need to stay more or less in sync with upstream. Also adjust some over-eager changes in the README from commit 957338418. I have no intention of making changes that require C99 in this code, until such time as all the live back branches require C99. Otherwise back-patching will get too exciting. For the same reason, absorb assorted whitespace and other cosmetic changes from HEAD into the back branches; mostly this reflects use of improved versions of pgindent. All in all then, quite a boring update. But I figured I'd get it done while I was looking at this code. --- src/timezone/README | 18 ++++- src/timezone/localtime.c | 23 +++++- src/timezone/zic.c | 169 ++++++++++++++++++++++++--------------- 3 files changed, 142 insertions(+), 68 deletions(-) diff --git a/src/timezone/README b/src/timezone/README index 3c593933c1c8..9939aa6dd7ea 100644 --- a/src/timezone/README +++ b/src/timezone/README @@ -55,7 +55,7 @@ match properly on the old version. Time Zone code ============== -The code in this directory is currently synced with tzcode release 2019b. +The code in this directory is currently synced with tzcode release 2020a. There are many cosmetic (and not so cosmetic) differences from the original tzcode library, but diffs in the upstream version should usually be propagated to our version. Here are some notes about that. @@ -71,7 +71,14 @@ fixed that.) 
* We need the code to follow Postgres' portability conventions; this includes relying on configure's results rather than hand-hacked -#defines (see private.h). +#defines (see private.h in particular). + +* Similarly, avoid relying on features that may not exist on old +systems. In particular this means using Postgres' definitions of the int32 +and int64 typedefs, not int_fast32_t/int_fast64_t. Likewise we use +PG_INT32_MIN/MAX not INT32_MIN/MAX. (Once we desupport all PG versions +that don't require C99, it'd be practical to rely on and remove +this set of diffs; but that day is not yet.) * Since Postgres is typically built on a system that has its own copy of the functions, we must avoid conflicting with those. This @@ -109,6 +116,13 @@ to first run the tzcode source files through a sed filter like this: -e 's|^\*/| */|' \ -e 's/\bregister[ \t]//g' \ -e 's/\bATTRIBUTE_PURE[ \t]//g' \ + -e 's/int_fast32_t/int32/g' \ + -e 's/int_fast64_t/int64/g' \ + -e 's/intmax_t/int64/g' \ + -e 's/INT32_MIN/PG_INT32_MIN/g' \ + -e 's/INT32_MAX/PG_INT32_MAX/g' \ + -e 's/INTMAX_MIN/PG_INT64_MIN/g' \ + -e 's/INTMAX_MAX/PG_INT64_MAX/g' \ -e 's/struct[ \t]+tm\b/struct pg_tm/g' \ -e 's/\btime_t\b/pg_time_t/g' \ -e 's/lineno/lineno_t/g' \ diff --git a/src/timezone/localtime.c b/src/timezone/localtime.c index 787f0b69d630..0f65f3c648e5 100644 --- a/src/timezone/localtime.c +++ b/src/timezone/localtime.c @@ -92,6 +92,7 @@ struct rule static struct pg_tm *gmtsub(pg_time_t const *, int32, struct pg_tm *); static bool increment_overflow(int *, int); static bool increment_overflow_time(pg_time_t *, int32); +static int64 leapcorr(struct state const *, pg_time_t); static struct pg_tm *timesub(pg_time_t const *, int32, struct state const *, struct pg_tm *); static bool typesequiv(struct state const *, int, int); @@ -477,12 +478,14 @@ tzloadbody(char const *name, char *canonname, struct state *sp, bool doextend, for (i = 0; i < ts->timecnt; i++) if (sp->timecnt == 0 - || sp->ats[sp->timecnt - 1] < 
ts->ats[i]) + || (sp->ats[sp->timecnt - 1] + < ts->ats[i] + leapcorr(sp, ts->ats[i]))) break; while (i < ts->timecnt && sp->timecnt < TZ_MAX_TIMES) { - sp->ats[sp->timecnt] = ts->ats[i]; + sp->ats[sp->timecnt] + = ts->ats[i] + leapcorr(sp, ts->ats[i]); sp->types[sp->timecnt] = (sp->typecnt + ts->types[i]); sp->timecnt++; @@ -1601,6 +1604,22 @@ increment_overflow_time(pg_time_t *tp, int32 j) return false; } +static int64 +leapcorr(struct state const *sp, pg_time_t t) +{ + struct lsinfo const *lp; + int i; + + i = sp->leapcnt; + while (--i >= 0) + { + lp = &sp->lsis[i]; + if (t >= lp->ls_trans) + return lp->ls_corr; + } + return 0; +} + /* * Find the next DST transition time in the given zone after the given time * diff --git a/src/timezone/zic.c b/src/timezone/zic.c index 9df81824a0f0..e5a3ca26f42e 100644 --- a/src/timezone/zic.c +++ b/src/timezone/zic.c @@ -125,13 +125,14 @@ static void warning(const char *string,...) pg_attribute_printf(1, 2); static void usage(FILE *stream, int status) pg_attribute_noreturn(); static void addtt(zic_t starttime, int type); static int addtype(zic_t, char const *, bool, bool, bool); -static void leapadd(zic_t, bool, int, int); +static void leapadd(zic_t, int, int); static void adjleap(void); static void associate(void); static void dolink(const char *, const char *, bool); static char **getfields(char *buf); static zic_t gethms(const char *string, const char *errstring); static zic_t getsave(char *, bool *); +static void inexpires(char **, int); static void infile(const char *filename); static void inleap(char **fields, int nfields); static void inlink(char **fields, int nfields); @@ -202,6 +203,7 @@ static int typecnt; #define LC_ZONE 1 #define LC_LINK 2 #define LC_LEAP 3 +#define LC_EXPIRES 4 /* * Which fields are which on a Zone line. @@ -267,6 +269,9 @@ static int typecnt; #define LP_ROLL 6 #define LEAP_FIELDS 7 +/* Expires lines are like Leap lines, except without CORR and ROLL fields. 
*/ +#define EXPIRES_FIELDS 5 + /* * Year synonyms. */ @@ -312,6 +317,7 @@ static struct lookup const zi_line_codes[] = { }; static struct lookup const leap_line_codes[] = { {"Leap", LC_LEAP}, + {"Expires", LC_EXPIRES}, {NULL, 0} }; @@ -584,6 +590,12 @@ static zic_t const max_time = MAXVAL(zic_t, TIME_T_BITS_IN_FILE); static zic_t lo_time = MINVAL(zic_t, TIME_T_BITS_IN_FILE); static zic_t hi_time = MAXVAL(zic_t, TIME_T_BITS_IN_FILE); +/* The time specified by an Expires line, or negative if no such line. */ +static zic_t leapexpires = -1; + +/* The time specified by an #expires comment, or negative if no such line. */ +static zic_t comment_leapexpires = -1; + /* Set the time range of the output to TIMERANGE. Return true if successful. */ static bool @@ -1279,7 +1291,8 @@ infile(const char *name) } if (nfields == 0) { - /* nothing to do */ + if (name == leapsec && *buf == '#') + sscanf(buf, "#expires " INT64_FORMAT, &comment_leapexpires); } else if (wantcont) { @@ -1311,6 +1324,10 @@ infile(const char *name) inleap(fields, nfields); wantcont = false; break; + case LC_EXPIRES: + inexpires(fields, nfields); + wantcont = false; + break; default: /* "cannot happen" */ fprintf(stderr, _("%s: panic: Invalid l_value %d\n"), @@ -1634,8 +1651,8 @@ inzsub(char **fields, int nfields, bool iscont) return hasuntil; } -static void -inleap(char **fields, int nfields) +static zic_t +getleapdatetime(char **fields, int nfields, bool expire_line) { const char *cp; const struct lookup *lp; @@ -1651,11 +1668,6 @@ inleap(char **fields, int nfields) zic_t t; char xs; - if (nfields != LEAP_FIELDS) - { - error(_("wrong number of fields on Leap line")); - return; - } dayoff = 0; cp = fields[LP_YEAR]; if (sscanf(cp, "%d%c", &year, &xs) != 1) @@ -1664,13 +1676,16 @@ inleap(char **fields, int nfields) * Leapin' Lizards! 
*/ error(_("invalid leaping year")); - return; + return -1; + } + if (!expire_line) + { + if (!leapseen || leapmaxyear < year) + leapmaxyear = year; + if (!leapseen || leapminyear > year) + leapminyear = year; + leapseen = true; } - if (!leapseen || leapmaxyear < year) - leapmaxyear = year; - if (!leapseen || leapminyear > year) - leapminyear = year; - leapseen = true; j = EPOCH_YEAR; while (j != year) { @@ -1689,7 +1704,7 @@ inleap(char **fields, int nfields) if ((lp = byword(fields[LP_MONTH], mon_names)) == NULL) { error(_("invalid month name")); - return; + return -1; } month = lp->l_value; j = TM_JANUARY; @@ -1704,56 +1719,70 @@ inleap(char **fields, int nfields) day <= 0 || day > len_months[isleap(year)][month]) { error(_("invalid day of month")); - return; + return -1; } dayoff = oadd(dayoff, day - 1); if (dayoff < min_time / SECSPERDAY) { error(_("time too small")); - return; + return -1; } if (dayoff > max_time / SECSPERDAY) { error(_("time too large")); - return; + return -1; } t = dayoff * SECSPERDAY; tod = gethms(fields[LP_TIME], _("invalid time of day")); - cp = fields[LP_CORR]; + t = tadd(t, tod); + if (t < 0) + error(_("leap second precedes Epoch")); + return t; +} + +static void +inleap(char **fields, int nfields) +{ + if (nfields != LEAP_FIELDS) + error(_("wrong number of fields on Leap line")); + else { - bool positive; - int count; + zic_t t = getleapdatetime(fields, nfields, false); - if (strcmp(cp, "") == 0) - { /* infile() turns "-" into "" */ - positive = false; - count = 1; - } - else if (strcmp(cp, "+") == 0) + if (0 <= t) { - positive = true; - count = 1; - } - else - { - error(_("illegal CORRECTION field on Leap line")); - return; - } - if ((lp = byword(fields[LP_ROLL], leap_types)) == NULL) - { - error(_("illegal Rolling/Stationary field on Leap line")); - return; - } - t = tadd(t, tod); - if (t < 0) - { - error(_("leap second precedes Epoch")); - return; + struct lookup const *lp = byword(fields[LP_ROLL], leap_types); + + if (!lp) + 
error(_("invalid Rolling/Stationary field on Leap line")); + else + { + int correction = 0; + + if (!fields[LP_CORR][0]) /* infile() turns "-" into "". */ + correction = -1; + else if (strcmp(fields[LP_CORR], "+") == 0) + correction = 1; + else + error(_("invalid CORRECTION field on Leap line")); + if (correction) + leapadd(t, correction, lp->l_value); + } } - leapadd(t, positive, lp->l_value, count); } } +static void +inexpires(char **fields, int nfields) +{ + if (nfields != EXPIRES_FIELDS) + error(_("wrong number of fields on Expires line")); + else if (0 <= leapexpires) + error(_("multiple Expires lines")); + else + leapexpires = getleapdatetime(fields, nfields, true); +} + static void inlink(char **fields, int nfields) { @@ -3369,12 +3398,11 @@ addtype(zic_t utoff, char const *abbr, bool isdst, bool ttisstd, bool ttisut) } static void -leapadd(zic_t t, bool positive, int rolling, int count) +leapadd(zic_t t, int correction, int rolling) { - int i, - j; + int i; - if (leapcnt + (positive ? count : 1) > TZ_MAX_LEAPS) + if (TZ_MAX_LEAPS <= leapcnt) { error(_("too many leap seconds")); exit(EXIT_FAILURE); @@ -3382,19 +3410,13 @@ leapadd(zic_t t, bool positive, int rolling, int count) for (i = 0; i < leapcnt; ++i) if (t <= trans[i]) break; - do - { - for (j = leapcnt; j > i; --j) - { - trans[j] = trans[j - 1]; - corr[j] = corr[j - 1]; - roll[j] = roll[j - 1]; - } - trans[i] = t; - corr[i] = positive ? 
1 : -count; - roll[i] = rolling; - ++leapcnt; - } while (positive && --count != 0); + memmove(&trans[i + 1], &trans[i], (leapcnt - i) * sizeof *trans); + memmove(&corr[i + 1], &corr[i], (leapcnt - i) * sizeof *corr); + memmove(&roll[i + 1], &roll[i], (leapcnt - i) * sizeof *roll); + trans[i] = t; + corr[i] = correction; + roll[i] = rolling; + ++leapcnt; } static void @@ -3418,6 +3440,25 @@ adjleap(void) trans[i] = tadd(trans[i], last); last = corr[i] += last; } + + if (leapexpires < 0) + { + leapexpires = comment_leapexpires; + if (0 <= leapexpires) + warning(_("\"#expires\" is obsolescent; use \"Expires\"")); + } + + if (0 <= leapexpires) + { + leapexpires = oadd(leapexpires, last); + if (!(leapcnt == 0 || (trans[leapcnt - 1] < leapexpires))) + { + error(_("last Leap time does not precede Expires time")); + exit(EXIT_FAILURE); + } + if (leapexpires <= hi_time) + hi_time = leapexpires - 1; + } } static char * From 2b2a070d98b2f2c7ecc031e582cfefa400316ce3 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Thu, 18 Jun 2020 10:40:10 +0900 Subject: [PATCH 007/334] Remove reset of testtablespace from pg_regress on Windows testtablespace is an extra path used as tablespace location in the main regression test suite, computed from --outputdir as defined by the caller of pg_regress (current directory if undefined). This special handling was introduced as of f10589e to be specific to MSVC, as we let pg_regress' Makefile handle this cleanup in other environments. This moves the cleanup to the MSVC script running regression tests instead where needed: check, installcheck and upgradecheck. I have also checked this patch on MSVC with repeated runs of each target. 
Author: Kyotaro Horiguchi, Michael Paquier Discussion: https://postgr.es/m/20200219.142519.437573253063431435.horikyota.ntt@gmail.com --- src/test/regress/pg_regress.c | 22 ---------------------- src/tools/msvc/vcregress.pl | 17 +++++++++++++++-- 2 files changed, 15 insertions(+), 24 deletions(-) diff --git a/src/test/regress/pg_regress.c b/src/test/regress/pg_regress.c index f11a3b9e26e6..c8d190d2489f 100644 --- a/src/test/regress/pg_regress.c +++ b/src/test/regress/pg_regress.c @@ -494,28 +494,6 @@ convert_sourcefiles_in(const char *source_subdir, const char *dest_dir, const ch snprintf(testtablespace, MAXPGPATH, "%s/testtablespace", outputdir); -#ifdef WIN32 - - /* - * On Windows only, clean out the test tablespace dir, or create it if it - * doesn't exist. On other platforms we expect the Makefile to take care - * of that. (We don't migrate that functionality in here because it'd be - * harder to cope with platform-specific issues such as SELinux.) - * - * XXX it would be better if pg_regress.c had nothing at all to do with - * testtablespace, and this were handled by a .BAT file or similar on - * Windows. See pgsql-hackers discussion of 2008-01-18. 
- */ - if (directory_exists(testtablespace)) - if (!rmtree(testtablespace, true)) - { - fprintf(stderr, _("\n%s: could not remove test tablespace \"%s\"\n"), - progname, testtablespace); - exit(2); - } - make_directory(testtablespace); -#endif - /* finally loop on each file and do the replacement */ for (name = names; *name; name++) { diff --git a/src/tools/msvc/vcregress.pl b/src/tools/msvc/vcregress.pl index 3365ee578c3d..d6763ad4ac57 100644 --- a/src/tools/msvc/vcregress.pl +++ b/src/tools/msvc/vcregress.pl @@ -123,6 +123,8 @@ sub installcheck_internal sub installcheck { my $schedule = shift || 'serial'; + + CleanupTablespaceDirectory(); installcheck_internal($schedule); return; } @@ -143,6 +145,7 @@ sub check "--temp-instance=./tmp_check"); push(@args, $maxconn) if $maxconn; push(@args, $temp_config) if $temp_config; + CleanupTablespaceDirectory(); system(@args); my $status = $? >> 8; exit $status if $status; @@ -570,8 +573,8 @@ sub upgradecheck $ENV{PGDATA} = "$data.old"; my $outputdir = "$tmp_root/regress"; my @EXTRA_REGRESS_OPTS = ("--outputdir=$outputdir"); - mkdir "$outputdir" || die $!; - mkdir "$outputdir/testtablespace" || die $!; + mkdir "$outputdir" || die $!; + CleanupTablespaceDirectory($outputdir); my $logdir = "$topdir/src/bin/pg_upgrade/log"; rmtree($logdir); @@ -737,6 +740,16 @@ sub InstallTemp return; } +sub CleanupTablespaceDirectory +{ + my $testdir = shift || getcwd(); + + my $testtablespace = "$testdir/testtablespace"; + + rmtree($testtablespace) if (-d $testtablespace); + mkdir($testtablespace); +} + sub usage { print STDERR From 9d402c73ade412bdeb9064c81fc4ed071c4e93f8 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Thu, 18 Jun 2020 08:41:31 +0200 Subject: [PATCH 008/334] Expand tests for factorial Move from int4 to numeric test. (They were originally int4 functions, but were reimplemented for numeric in 04a4821adef38155b7920ba9eb83c4c3c29156f8.) Add some tests for edge cases. 
Discussion: https://www.postgresql.org/message-id/flat/6ce1df0e-86a3-e544-743a-f357ff663f68%402ndquadrant.com --- src/test/regress/expected/int4.out | 12 -------- src/test/regress/expected/numeric.out | 41 +++++++++++++++++++++++++++ src/test/regress/sql/int4.sql | 4 --- src/test/regress/sql/numeric.sql | 11 +++++++ 4 files changed, 52 insertions(+), 16 deletions(-) diff --git a/src/test/regress/expected/int4.out b/src/test/regress/expected/int4.out index c384af18ee89..77f43739a7c1 100644 --- a/src/test/regress/expected/int4.out +++ b/src/test/regress/expected/int4.out @@ -299,18 +299,6 @@ SELECT int4 '1000' < int4 '999' AS false; f (1 row) -SELECT 4! AS twenty_four; - twenty_four -------------- - 24 -(1 row) - -SELECT !!3 AS six; - six ------ - 6 -(1 row) - SELECT 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 AS ten; ten ----- diff --git a/src/test/regress/expected/numeric.out b/src/test/regress/expected/numeric.out index c7fe63d03744..b255be7c8520 100644 --- a/src/test/regress/expected/numeric.out +++ b/src/test/regress/expected/numeric.out @@ -2315,3 +2315,44 @@ FROM (VALUES (0::numeric, 0::numeric), SELECT lcm(9999 * (10::numeric)^131068 + (10::numeric^131068 - 1), 2); -- overflow ERROR: value overflows numeric format +-- +-- Tests for factorial +-- +SELECT 4!; + ?column? +---------- + 24 +(1 row) + +SELECT !!3; + ?column? +---------- + 6 +(1 row) + +SELECT factorial(15); + factorial +--------------- + 1307674368000 +(1 row) + +SELECT 100000!; +ERROR: value overflows numeric format +SELECT 0!; + ?column? +---------- + 1 +(1 row) + +SELECT -4!; + ?column? +---------- + 1 +(1 row) + +SELECT factorial(-4); + factorial +----------- + 1 +(1 row) + diff --git a/src/test/regress/sql/int4.sql b/src/test/regress/sql/int4.sql index a9e90a96c4c0..b00c9dea2a6d 100644 --- a/src/test/regress/sql/int4.sql +++ b/src/test/regress/sql/int4.sql @@ -114,10 +114,6 @@ SELECT int2 '2' * int4 '2' = int4 '16' / int2 '4' AS true; SELECT int4 '1000' < int4 '999' AS false; -SELECT 4! 
AS twenty_four; - -SELECT !!3 AS six; - SELECT 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 AS ten; SELECT 2 + 2 / 2 AS three; diff --git a/src/test/regress/sql/numeric.sql b/src/test/regress/sql/numeric.sql index 41475a9a245f..1332a9cf07a6 100644 --- a/src/test/regress/sql/numeric.sql +++ b/src/test/regress/sql/numeric.sql @@ -1111,3 +1111,14 @@ FROM (VALUES (0::numeric, 0::numeric), (4232.820::numeric, 132.72000::numeric)) AS v(a, b); SELECT lcm(9999 * (10::numeric)^131068 + (10::numeric^131068 - 1), 2); -- overflow + +-- +-- Tests for factorial +-- +SELECT 4!; +SELECT !!3; +SELECT factorial(15); +SELECT 100000!; +SELECT 0!; +SELECT -4!; +SELECT factorial(-4); From 0a40563eadc67472d6fd50dabf7002afa25c3330 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Thu, 18 Jun 2020 08:41:31 +0200 Subject: [PATCH 009/334] Disallow factorial of negative numbers The previous implementation returned 1 for all negative numbers, which is not sensible under any definition. Discussion: https://www.postgresql.org/message-id/flat/6ce1df0e-86a3-e544-743a-f357ff663f68%402ndquadrant.com --- src/backend/utils/adt/numeric.c | 4 ++++ src/test/regress/expected/numeric.out | 12 ++---------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/backend/utils/adt/numeric.c b/src/backend/utils/adt/numeric.c index eea42398541b..5f23f2afac86 100644 --- a/src/backend/utils/adt/numeric.c +++ b/src/backend/utils/adt/numeric.c @@ -2946,6 +2946,10 @@ numeric_fac(PG_FUNCTION_ARGS) NumericVar fact; NumericVar result; + if (num < 0) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("factorial of a negative number is undefined"))); if (num <= 1) { res = make_result(&const_one); diff --git a/src/test/regress/expected/numeric.out b/src/test/regress/expected/numeric.out index b255be7c8520..2f3ecb50a733 100644 --- a/src/test/regress/expected/numeric.out +++ b/src/test/regress/expected/numeric.out @@ -2345,14 +2345,6 @@ SELECT 0!; (1 row) SELECT -4!; - ?column? 
----------- - 1 -(1 row) - +ERROR: factorial of a negative number is undefined SELECT factorial(-4); - factorial ----------- - 1 -(1 row) - +ERROR: factorial of a negative number is undefined From b48df818dcbd1a5e34ab7a2d9f98828b7b62140c Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Thu, 18 Jun 2020 16:34:59 +0900 Subject: [PATCH 010/334] Fix oldest xmin and LSN computation across repslots after advancing Advancing a replication slot did not recompute the oldest xmin and LSN values across replication slots, preventing resource removal like segments not recycled at checkpoint time. The original commit that introduced the slot advancing in 9c7d06d never did the update of those oldest values, and b0afdca removed this code. This commit adds a TAP test to check segment recycling with advancing for physical slots, enforcing an extra segment switch before advancing to check if the segment gets correctly recycled after a checkpoint. Reported-by: Andres Freund Reviewed-by: Alexey Kondratov, Kyotaro Horiguchi Discussion: https://postgr.es/m/20200609171904.kpltxxvjzislidks@alap3.anarazel.de Backpatch-through: 11 --- src/backend/replication/slotfuncs.c | 7 +++++++ src/test/recovery/t/001_stream_rep.pl | 21 +++++++++++++++++++-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/src/backend/replication/slotfuncs.c b/src/backend/replication/slotfuncs.c index 1b929a603e51..06e4955de73b 100644 --- a/src/backend/replication/slotfuncs.c +++ b/src/backend/replication/slotfuncs.c @@ -621,6 +621,13 @@ pg_replication_slot_advance(PG_FUNCTION_ARGS) values[0] = NameGetDatum(&MyReplicationSlot->data.name); nulls[0] = false; + /* + * Recompute the minimum LSN and xmin across all slots to adjust with the + * advancing potentially done. + */ + ReplicationSlotsComputeRequiredXmin(false); + ReplicationSlotsComputeRequiredLSN(); + ReplicationSlotRelease(); /* Return the reached position.
*/ diff --git a/src/test/recovery/t/001_stream_rep.pl b/src/test/recovery/t/001_stream_rep.pl index 0c316c18082e..778f11b28b43 100644 --- a/src/test/recovery/t/001_stream_rep.pl +++ b/src/test/recovery/t/001_stream_rep.pl @@ -3,7 +3,7 @@ use warnings; use PostgresNode; use TestLib; -use Test::More tests => 35; +use Test::More tests => 36; # Initialize master node my $node_master = get_new_node('master'); @@ -364,15 +364,26 @@ sub replay_check qq[SELECT 1 FROM replayed WHERE val = $newval]); is($is_replayed, qq(1), "standby_2 didn't replay master value $newval"); +# Drop any existing slots on the primary, for the follow-up tests. +$node_master->safe_psql('postgres', + "SELECT pg_drop_replication_slot(slot_name) FROM pg_replication_slots;"); + # Test physical slot advancing and its durability. Create a new slot on # the primary, not used by any of the standbys. This reserves WAL at creation. my $phys_slot = 'phys_slot'; $node_master->safe_psql('postgres', "SELECT pg_create_physical_replication_slot('$phys_slot', true);"); +# Generate some WAL, and switch to a new segment, used to check that +# the previous segment is correctly getting recycled as the slot advancing +# would recompute the minimum LSN calculated across all slots. 
+my $segment_removed = $node_master->safe_psql('postgres', + 'SELECT pg_walfile_name(pg_current_wal_lsn())'); +chomp($segment_removed); $node_master->psql( 'postgres', " CREATE TABLE tab_phys_slot (a int); - INSERT INTO tab_phys_slot VALUES (generate_series(1,10));"); + INSERT INTO tab_phys_slot VALUES (generate_series(1,10)); + SELECT pg_switch_wal();"); my $current_lsn = $node_master->safe_psql('postgres', "SELECT pg_current_wal_lsn();"); chomp($current_lsn); @@ -392,3 +403,9 @@ sub replay_check chomp($phys_restart_lsn_post); ok( ($phys_restart_lsn_pre cmp $phys_restart_lsn_post) == 0, "physical slot advance persists across restarts"); + +# Check if the previous segment gets correctly recycled after the +# server stopped cleanly, causing a shutdown checkpoint to be generated. +my $master_data = $node_master->data_dir; +ok(!-f "$master_data/pg_wal/$segment_removed", + "WAL segment $segment_removed recycled after physical slot advancing"); From a3235a53ae9f6f21f823081c610b0901db6aa665 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 18 Jun 2020 16:27:18 -0400 Subject: [PATCH 011/334] Doc: document POSIX-style time zone specifications in full. We'd glossed over most of this complexity for years, but it's hard to avoid writing it all down now, so that we can explain what happens when there's no "posixrules" file in the IANA time zone database. That was at best a tiny minority situation till now, but it's likely to become quite common in the future, so we'd better explain it. Nonetheless, we don't really encourage people to use POSIX zone specs; picking a named zone is almost always what you really want, unless perhaps you're stuck with an out-of-date zone database. Therefore, let's shove all this detail into an appendix. Patch by me; thanks to Robert Haas for help with some awkward wording. 
Discussion: https://postgr.es/m/1390.1562258309@sss.pgh.pa.us --- doc/src/sgml/datatype.sgml | 38 +------ doc/src/sgml/datetime.sgml | 212 +++++++++++++++++++++++++++++++++++++ 2 files changed, 217 insertions(+), 33 deletions(-) diff --git a/doc/src/sgml/datatype.sgml b/doc/src/sgml/datatype.sgml index 3df189ad853c..49fb19ff9194 100644 --- a/doc/src/sgml/datatype.sgml +++ b/doc/src/sgml/datatype.sgml @@ -2478,7 +2478,7 @@ TIMESTAMP WITH TIME ZONE '2004-10-19 10:23:54+02' A time zone abbreviation, for example PST. Such a specification merely defines a particular offset from UTC, in contrast to full time zone names which can imply a set of daylight - savings transition-date rules as well. The recognized abbreviations + savings transition rules as well. The recognized abbreviations are listed in the pg_timezone_abbrevs view (see ). You cannot set the configuration parameters or @@ -2492,25 +2492,10 @@ TIMESTAMP WITH TIME ZONE '2004-10-19 10:23:54+02' In addition to the timezone names and abbreviations, PostgreSQL will accept POSIX-style time zone - specifications of the form STDoffset or - STDoffsetDST, where - STD is a zone abbreviation, offset is a - numeric offset in hours west from UTC, and DST is an - optional daylight-savings zone abbreviation, assumed to stand for one - hour ahead of the given offset. For example, if EST5EDT - were not already a recognized zone name, it would be accepted and would - be functionally equivalent to United States East Coast time. In this - syntax, a zone abbreviation can be a string of letters, or an - arbitrary string surrounded by angle brackets (<>). - When a daylight-savings zone abbreviation is present, - it is assumed to be used - according to the same daylight-savings transition rules used in the - IANA time zone database's posixrules entry. - In a standard PostgreSQL installation, - posixrules is the same as US/Eastern, so - that POSIX-style time zone specifications follow USA daylight-savings - rules. 
If needed, you can adjust this behavior by replacing the - posixrules file. + specifications, as described in + . This option is not + normally preferable to using a named time zone, but it may be + necessary if no suitable IANA time zone entry is available. @@ -2537,19 +2522,6 @@ TIMESTAMP WITH TIME ZONE '2004-10-19 10:23:54+02' above, this is not necessarily the same as local civil time on that date. - - One should be wary that the POSIX-style time zone feature can - lead to silently accepting bogus input, since there is no check on the - reasonableness of the zone abbreviations. For example, SET - TIMEZONE TO FOOBAR0 will work, leaving the system effectively using - a rather peculiar abbreviation for UTC. - Another issue to keep in mind is that in POSIX time zone names, - positive offsets are used for locations west of Greenwich. - Everywhere else, PostgreSQL follows the - ISO-8601 convention that positive timezone offsets are east - of Greenwich. - - In all cases, timezone names and abbreviations are recognized case-insensitively. (This is a change from PostgreSQL diff --git a/doc/src/sgml/datetime.sgml b/doc/src/sgml/datetime.sgml index 7cce826e2d00..7da4d0b7789d 100644 --- a/doc/src/sgml/datetime.sgml +++ b/doc/src/sgml/datetime.sgml @@ -555,6 +555,218 @@ + + <acronym>POSIX</acronym> Time Zone Specifications + + + time zone + POSIX-style specification + + + + PostgreSQL can accept time zone specifications that + are written according to the POSIX standard's rules + for the TZ environment + variable. POSIX time zone specifications are + inadequate to deal with the complexity of real-world time zone history, + but there are sometimes reasons to use them. + + + + A POSIX time zone specification has the form + +STD offset DST dstoffset , rule + + (For readability, we show spaces between the fields, but spaces should + not be used in practice.) The fields are: + + + + STD is the zone abbreviation to be used + for standard time. 
+ + + + + offset is the zone's standard-time offset + from UTC. + + + + + DST is the zone abbreviation to be used + for daylight-savings time. If this field and the following ones are + omitted, the zone uses a fixed UTC offset with no daylight-savings + rule. + + + + + dstoffset is the daylight-savings offset + from UTC. This field is typically omitted, since it defaults to one + hour less than the standard-time offset, + which is usually the right thing. + + + + + rule defines the rule for when daylight + savings is in effect, as described below. + + + + + + + In this syntax, a zone abbreviation can be a string of letters, such + as EST, or an arbitrary string surrounded by angle + brackets, such as <UTC-05>. + Note that the zone abbreviations given here are only used for output, + and even then only in some timestamp output formats. The zone + abbreviations recognized in timestamp input are determined as explained + in . + + + + The offset fields specify the hours, and optionally minutes and seconds, + difference from UTC. They have the format + hh:mm:ss + optionally with a leading sign (+ + or -). The positive sign is used for + zones west of Greenwich. (Note that this is the + opposite of the ISO-8601 sign convention used elsewhere in + PostgreSQL.) hh can have + one or two digits; mm + and ss (if used) must have two. + + + + The daylight-savings transition rule has the + format + +dstdate / dsttime , stddate / stdtime + + (As before, spaces should not be included in practice.) + The dstdate + and dsttime fields define when daylight-savings + time starts, while stddate + and stdtime define when standard time + starts. (In some cases, notably in zones south of the equator, the + former might be later in the year than the latter.) The date fields + have one of these formats: + + + n + + + A plain integer denotes a day of the year, counting from zero to + 364, or to 365 in leap years. 
+ + + + + Jn + + + In this form, n counts from 1 to 365, + and February 29 is not counted even if it is present. (Thus, a + transition occurring on February 29 could not be specified this + way. However, days after February have the same numbers whether + it's a leap year or not, so that this form is usually more useful + than the plain-integer form for transitions on fixed dates.) + + + + + Mm.n.d + + + This form specifies a transition that always happens during the same + month and on the same day of the week. m + identifies the month, from 1 to 12. n + specifies the n'th occurrence of the + weekday identified by d. + n is a number between 1 and 4, or 5 + meaning the last occurrence of that weekday in the month (which + could be the fourth or the fifth). d is + a number between 0 and 6, with 0 indicating Sunday. + For example, M3.2.0 means the second + Sunday in March. + + + + + + + + + The M format is sufficient to describe many common + daylight-savings transition laws. But note that none of these variants + can deal with daylight-savings law changes, so in practice the + historical data stored for named time zones (in the IANA time zone + database) is necessary to interpret past time stamps correctly. + + + + + The time fields in a transition rule have the same format as the offset + fields described previously, except that they cannot contain signs. + They define the current local time at which the change to the other + time occurs. If omitted, they default to 02:00:00. + + + + If a daylight-savings abbreviation is given but the + transition rule field is omitted, + PostgreSQL attempts to determine the + transition times by consulting the posixrules file + in the IANA time zone database. This file has the same format as a + full time zone entry, but only its transition timing rules are used, + not its UTC offsets. 
Typically, this file has the same contents as the + US/Eastern file, so that POSIX-style time zone + specifications follow USA daylight-savings rules. If needed, you can + adjust this behavior by replacing the posixrules + file. + + + + + The facility to consult a posixrules file has + been deprecated by IANA, and it is likely to go away in the future. + One bug in this feature, which is unlikely to be fixed before it + disappears, is that it fails to apply DST rules to dates after 2038. + + + + + If the posixrules file is not present, + the fallback behavior is to use the + rule M3.2.0,M11.1.0, which corresponds to USA + practice as of 2020 (that is, spring forward on the second Sunday of + March, fall back on the first Sunday of November, both transitions + occurring at 2AM prevailing time). + + + + As an example, CET-1CEST,M3.5.0,M10.5.0/3 describes + current (as of 2020) timekeeping practice in Paris. This specification + says that standard time has the abbreviation CET and + is one hour ahead (east) of UTC; daylight savings time has the + abbreviation CEST and is implicitly two hours ahead + of UTC; daylight savings time begins on the last Sunday in March at 2AM + CET and ends on the last Sunday in October at 3AM CEST. + + + + One should be wary that it is easy to misspell a POSIX-style time zone + specification, since there is no check on the reasonableness of the + zone abbreviation(s). For example, SET TIMEZONE TO + FOOBAR0 will work, leaving the system effectively using a + rather peculiar abbreviation for UTC. + + + + History of Units From 3b37a6de027c27f1e4ac865aaa34dd92cf5dc7a1 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Mon, 8 Jun 2020 16:36:51 -0700 Subject: [PATCH 012/334] Add basic spinlock tests to regression tests. As s_lock_test, the already existing test for spinlocks, isn't run in an automated fashion (and doesn't test a normal backend environment), adding tests that are run as part of a normal regression run is a good idea. 
Particularly in light of several recent and upcoming spinlock related fixes. Currently the new tests are run as part of the pre-existing test_atomic_ops() test. That perhaps can be quibbled about, but for now seems ok. The only operations that s_lock_test tests but the new tests don't are the detection of a stuck spinlock and S_LOCK_FREE (which is otherwise unused, not implemented on all platforms, and will be removed). This currently contains a test for more than INT_MAX spinlocks (only run with --disable-spinlocks), to ensure the recent commit fixing a bug with more than INT_MAX spinlock initializations is correct. That test is somewhat slow, so we might want to disable it after a few days. It might be worth retiring s_lock_test after this. The added coverage of a stuck spinlock probably isn't worth the added complexity? Author: Andres Freund Discussion: https://postgr.es/m/20200606023103.avzrctgv7476xj7i@alap3.anarazel.de --- src/test/regress/regress.c | 109 +++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c index 960c155e5f23..9bea2ada24aa 100644 --- a/src/test/regress/regress.c +++ b/src/test/regress/regress.c @@ -34,6 +34,7 @@ #include "optimizer/optimizer.h" #include "optimizer/plancat.h" #include "port/atomics.h" +#include "storage/spin.h" #include "utils/builtins.h" #include "utils/geo_decls.h" #include "utils/memutils.h" @@ -794,6 +795,108 @@ test_atomic_uint64(void) EXPECT_EQ_U64(pg_atomic_fetch_and_u64(&var, ~0), 0); } +/* + * Perform, fairly minimal, testing of the spinlock implementation. + * + * It's likely worth expanding these to actually test concurrency etc, but + * having some regularly run tests is better than none. + */ +static void +test_spinlock(void) +{ + /* + * Basic tests for spinlocks, as well as the underlying operations. 
+ * + * We embed the spinlock in a struct with other members to test that the + * spinlock operations don't perform too wide writes. + */ + { + struct test_lock_struct + { + char data_before[4]; + slock_t lock; + char data_after[4]; + } struct_w_lock; + + memcpy(struct_w_lock.data_before, "abcd", 4); + memcpy(struct_w_lock.data_after, "ef12", 4); + + /* test basic operations via the SpinLock* API */ + SpinLockInit(&struct_w_lock.lock); + SpinLockAcquire(&struct_w_lock.lock); + SpinLockRelease(&struct_w_lock.lock); + + /* test basic operations via underlying S_* API */ + S_INIT_LOCK(&struct_w_lock.lock); + S_LOCK(&struct_w_lock.lock); + S_UNLOCK(&struct_w_lock.lock); + + /* and that "contended" acquisition works */ + s_lock(&struct_w_lock.lock, "testfile", 17, "testfunc"); + S_UNLOCK(&struct_w_lock.lock); + + /* + * Check, using TAS directly, that a single spin cycle doesn't block + * when acquiring an already acquired lock. + */ +#ifdef TAS + S_LOCK(&struct_w_lock.lock); + + if (!TAS(&struct_w_lock.lock)) + elog(ERROR, "acquired already held spinlock"); + +#ifdef TAS_SPIN + if (!TAS_SPIN(&struct_w_lock.lock)) + elog(ERROR, "acquired already held spinlock"); +#endif /* defined(TAS_SPIN) */ + + S_UNLOCK(&struct_w_lock.lock); +#endif /* defined(TAS) */ + + /* + * Verify that after all of this the non-lock contents are still + * correct. + */ + if (memcmp(struct_w_lock.data_before, "abcd", 4) != 0) + elog(ERROR, "padding before spinlock modified"); + if (memcmp(struct_w_lock.data_after, "ef12", 4) != 0) + elog(ERROR, "padding after spinlock modified"); + } + + /* + * Ensure that allocating more than INT32_MAX emulated spinlocks + * works. That's interesting because the spinlock emulation uses a 32bit + * integer to map spinlocks onto semaphores. There've been bugs... + */ +#ifndef HAVE_SPINLOCKS + { + /* + * Initialize enough spinlocks to advance counter close to + * wraparound. 
It's too expensive to perform acquire/release for each, + * as those may be syscalls when the spinlock emulation is used (and + * even just atomic TAS would be expensive). + */ + for (uint32 i = 0; i < INT32_MAX - 100000; i++) + { + slock_t lock; + + SpinLockInit(&lock); + } + + for (uint32 i = 0; i < 200000; i++) + { + slock_t lock; + + SpinLockInit(&lock); + + SpinLockAcquire(&lock); + SpinLockRelease(&lock); + SpinLockAcquire(&lock); + SpinLockRelease(&lock); + } + } +#endif +} PG_FUNCTION_INFO_V1(test_atomic_ops); Datum @@ -805,6 +908,12 @@ test_atomic_ops(PG_FUNCTION_ARGS) test_atomic_uint64(); + /* + * Arguably this shouldn't be tested as part of this function, but it's + * closely enough related that that seems ok for now. + */ + test_spinlock(); + PG_RETURN_BOOL(true); } From cf1234a10e50ff9be0dc85184689ee4ebc57cd77 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Mon, 8 Jun 2020 16:50:37 -0700 Subject: [PATCH 013/334] Fix deadlock danger when atomic ops are done under spinlock. This was a danger only for --disable-spinlocks in combination with atomic operations unsupported by the current platform. While atomics.c was careful to signal that a separate semaphore ought to be used when spinlock emulation is active, spin.c didn't actually implement that mechanism. That's my (Andres') fault, it seems to have gotten lost during the development of the atomic operations support. Fix that issue and add test for nesting atomic operations inside a spinlock. 
Author: Andres Freund Discussion: https://postgr.es/m/20200605023302.g6v3ydozy5txifji@alap3.anarazel.de Backpatch: 9.5- --- src/backend/storage/lmgr/spin.c | 97 +++++++++++++++++++++++---------- src/test/regress/regress.c | 52 ++++++++++++++++++ 2 files changed, 119 insertions(+), 30 deletions(-) diff --git a/src/backend/storage/lmgr/spin.c b/src/backend/storage/lmgr/spin.c index 753943e46d62..9f7eae933922 100644 --- a/src/backend/storage/lmgr/spin.c +++ b/src/backend/storage/lmgr/spin.c @@ -28,8 +28,24 @@ #ifndef HAVE_SPINLOCKS + +/* + * No TAS, so spinlocks are implemented as PGSemaphores. + */ + +#ifndef HAVE_ATOMICS +#define NUM_EMULATION_SEMAPHORES (NUM_SPINLOCK_SEMAPHORES + NUM_ATOMICS_SEMAPHORES) +#else +#define NUM_EMULATION_SEMAPHORES (NUM_SPINLOCK_SEMAPHORES) +#endif /* DISABLE_ATOMICS */ + PGSemaphore *SpinlockSemaArray; -#endif + +#else /* !HAVE_SPINLOCKS */ + +#define NUM_EMULATION_SEMAPHORES 0 + +#endif /* HAVE_SPINLOCKS */ /* * Report the amount of shared memory needed to store semaphores for spinlock @@ -38,34 +54,19 @@ PGSemaphore *SpinlockSemaArray; Size SpinlockSemaSize(void) { - return SpinlockSemas() * sizeof(PGSemaphore); + return NUM_EMULATION_SEMAPHORES * sizeof(PGSemaphore); } -#ifdef HAVE_SPINLOCKS - /* * Report number of semaphores needed to support spinlocks. */ int SpinlockSemas(void) { - return 0; + return NUM_EMULATION_SEMAPHORES; } -#else /* !HAVE_SPINLOCKS */ -/* - * No TAS, so spinlocks are implemented as PGSemaphores. - */ - - -/* - * Report number of semaphores needed to support spinlocks. - */ -int -SpinlockSemas(void) -{ - return NUM_SPINLOCK_SEMAPHORES + NUM_ATOMICS_SEMAPHORES; -} +#ifndef HAVE_SPINLOCKS /* * Initialize spinlock emulation. @@ -92,23 +93,59 @@ SpinlockSemaInit(void) /* * s_lock.h hardware-spinlock emulation using semaphores * - * We map all spinlocks onto a set of NUM_SPINLOCK_SEMAPHORES semaphores. 
- * It's okay to map multiple spinlocks onto one semaphore because no process - * should ever hold more than one at a time. We just need enough semaphores - * so that we aren't adding too much extra contention from that. + * We map all spinlocks onto NUM_EMULATION_SEMAPHORES semaphores. It's okay to + * map multiple spinlocks onto one semaphore because no process should ever + * hold more than one at a time. We just need enough semaphores so that we + * aren't adding too much extra contention from that. + * + * There is one exception to the restriction of only holding one spinlock at a + * time, which is that it's ok if emulated atomic operations are nested inside + * spinlocks. To avoid the danger of spinlocks and atomic using the same sema, + * we make sure "normal" spinlocks and atomics backed by spinlocks use + * distinct semaphores (see the nested argument to s_init_lock_sema). * * slock_t is just an int for this implementation; it holds the spinlock - * number from 1..NUM_SPINLOCK_SEMAPHORES. We intentionally ensure that 0 + * number from 1..NUM_EMULATION_SEMAPHORES. We intentionally ensure that 0 * is not a valid value, so that testing with this code can help find * failures to initialize spinlocks. */ +static inline void +s_check_valid(int lockndx) +{ + if (unlikely(lockndx <= 0 || lockndx > NUM_EMULATION_SEMAPHORES)) + elog(ERROR, "invalid spinlock number: %d", lockndx); +} + void s_init_lock_sema(volatile slock_t *lock, bool nested) { static uint32 counter = 0; - - *lock = ((++counter) % NUM_SPINLOCK_SEMAPHORES) + 1; + uint32 offset; + uint32 sema_total; + uint32 idx; + + if (nested) + { + /* + * To allow nesting atomics inside spinlocked sections, use a + * different spinlock. See comment above. 
+ */ + offset = 1 + NUM_SPINLOCK_SEMAPHORES; + sema_total = NUM_ATOMICS_SEMAPHORES; + } + else + { + offset = 1; + sema_total = NUM_SPINLOCK_SEMAPHORES; + } + + idx = (counter++ % sema_total) + offset; + + /* double check we did things correctly */ + s_check_valid(idx); + + *lock = idx; } void @@ -116,8 +153,8 @@ s_unlock_sema(volatile slock_t *lock) { int lockndx = *lock; - if (lockndx <= 0 || lockndx > NUM_SPINLOCK_SEMAPHORES) - elog(ERROR, "invalid spinlock number: %d", lockndx); + s_check_valid(lockndx); + PGSemaphoreUnlock(SpinlockSemaArray[lockndx - 1]); } @@ -134,8 +171,8 @@ tas_sema(volatile slock_t *lock) { int lockndx = *lock; - if (lockndx <= 0 || lockndx > NUM_SPINLOCK_SEMAPHORES) - elog(ERROR, "invalid spinlock number: %d", lockndx); + s_check_valid(lockndx); + /* Note that TAS macros return 0 if *success* */ return !PGSemaphoreTryLock(SpinlockSemaArray[lockndx - 1]); } diff --git a/src/test/regress/regress.c b/src/test/regress/regress.c index 9bea2ada24aa..02397f2eb104 100644 --- a/src/test/regress/regress.c +++ b/src/test/regress/regress.c @@ -898,6 +898,56 @@ test_spinlock(void) #endif } +/* + * Verify that performing atomic ops inside a spinlock isn't a + * problem. Realistically that's only going to be a problem when both + * --disable-spinlocks and --disable-atomics are used, but it's cheap enough + * to just always test. + * + * The test works by initializing enough atomics that we'd conflict if there + * were an overlap between a spinlock and an atomic by holding a spinlock + * while manipulating more than NUM_SPINLOCK_SEMAPHORES atomics. + * + * NUM_TEST_ATOMICS doesn't really need to be more than + * NUM_SPINLOCK_SEMAPHORES, but it seems better to test a bit more + * extensively. 
+ */ +static void +test_atomic_spin_nest(void) +{ + slock_t lock; +#define NUM_TEST_ATOMICS (NUM_SPINLOCK_SEMAPHORES + NUM_ATOMICS_SEMAPHORES + 27) + pg_atomic_uint32 atomics32[NUM_TEST_ATOMICS]; + pg_atomic_uint64 atomics64[NUM_TEST_ATOMICS]; + + SpinLockInit(&lock); + + for (int i = 0; i < NUM_TEST_ATOMICS; i++) + { + pg_atomic_init_u32(&atomics32[i], 0); + pg_atomic_init_u64(&atomics64[i], 0); + } + + /* just so it's not all zeroes */ + for (int i = 0; i < NUM_TEST_ATOMICS; i++) + { + EXPECT_EQ_U32(pg_atomic_fetch_add_u32(&atomics32[i], i), 0); + EXPECT_EQ_U64(pg_atomic_fetch_add_u64(&atomics64[i], i), 0); + } + + /* test whether we can do atomic op with lock held */ + SpinLockAcquire(&lock); + for (int i = 0; i < NUM_TEST_ATOMICS; i++) + { + EXPECT_EQ_U32(pg_atomic_fetch_sub_u32(&atomics32[i], i), i); + EXPECT_EQ_U32(pg_atomic_read_u32(&atomics32[i]), 0); + EXPECT_EQ_U64(pg_atomic_fetch_sub_u64(&atomics64[i], i), i); + EXPECT_EQ_U64(pg_atomic_read_u64(&atomics64[i]), 0); + } + SpinLockRelease(&lock); +} +#undef NUM_TEST_ATOMICS + PG_FUNCTION_INFO_V1(test_atomic_ops); Datum test_atomic_ops(PG_FUNCTION_ARGS) @@ -914,6 +964,8 @@ test_atomic_ops(PG_FUNCTION_ARGS) */ test_spinlock(); + test_atomic_spin_nest(); + PG_RETURN_BOOL(true); } From f219167910ad33dfd8f1b0bba15323d71a91c4e9 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Thu, 18 Jun 2020 19:40:09 -0700 Subject: [PATCH 014/334] Clean up includes of s_lock.h. Users of spinlocks should use spin.h, not s_lock.h. And lwlock.h hasn't utilized spinlocks for quite a while. 
Discussion: https://postgr.es/m/20200618183041.upyrd25eosecyf3x@alap3.anarazel.de --- src/backend/main/main.c | 1 - src/include/storage/condition_variable.h | 2 +- src/include/storage/lwlock.h | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/backend/main/main.c b/src/backend/main/main.c index da3dae9e250f..a4dd233c7f92 100644 --- a/src/backend/main/main.c +++ b/src/backend/main/main.c @@ -35,7 +35,6 @@ #include "common/username.h" #include "port/atomics.h" #include "postmaster/postmaster.h" -#include "storage/s_lock.h" #include "storage/spin.h" #include "tcop/tcopprot.h" #include "utils/help_config.h" diff --git a/src/include/storage/condition_variable.h b/src/include/storage/condition_variable.h index c2be198f28e6..ad209acfac06 100644 --- a/src/include/storage/condition_variable.h +++ b/src/include/storage/condition_variable.h @@ -23,7 +23,7 @@ #define CONDITION_VARIABLE_H #include "storage/proclist_types.h" -#include "storage/s_lock.h" +#include "storage/spin.h" typedef struct { diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index c04ae971485e..af9b41795d26 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -20,7 +20,6 @@ #include "port/atomics.h" #include "storage/proclist_types.h" -#include "storage/s_lock.h" struct PGPROC; From 9bdb300dedf086cc54edf740088208e6b24307ef Mon Sep 17 00:00:00 2001 From: David Rowley Date: Fri, 19 Jun 2020 17:24:27 +1200 Subject: [PATCH 015/334] Fix EXPLAIN ANALYZE for parallel HashAgg plans Since 1f39bce02, HashAgg nodes have had the ability to spill to disk when memory consumption exceeds work_mem. That commit added new properties to EXPLAIN ANALYZE to show the maximum memory usage and disk usage, however, it didn't quite go as far as showing that information for parallel workers. Since workers may have experienced something very different from the main process, we should show this information per worker, as is done in Sort. 
Reviewed-by: Justin Pryzby Reviewed-by: Jeff Davis Discussion: https://postgr.es/m/CAApHDvpEKbfZa18mM1TD7qV6PG+w97pwCWq5tVD0dX7e11gRJw@mail.gmail.com Backpatch-through: 13, where the hashagg spilling code was added. --- src/backend/commands/explain.c | 110 ++++++++++++++++++++++++---- src/backend/executor/execParallel.c | 19 ++++- src/backend/executor/nodeAgg.c | 103 ++++++++++++++++++++++++++ src/include/executor/nodeAgg.h | 7 ++ src/include/nodes/execnodes.h | 22 ++++++ 5 files changed, 244 insertions(+), 17 deletions(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 9092b4b30944..67bdcb2b2785 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -3051,29 +3051,111 @@ show_hashagg_info(AggState *aggstate, ExplainState *es) Agg *agg = (Agg *) aggstate->ss.ps.plan; int64 memPeakKb = (aggstate->hash_mem_peak + 1023) / 1024; - Assert(IsA(aggstate, AggState)); - if (agg->aggstrategy != AGG_HASHED && agg->aggstrategy != AGG_MIXED) return; - if (es->costs && aggstate->hash_planned_partitions > 0) + if (es->format != EXPLAIN_FORMAT_TEXT) { - ExplainPropertyInteger("Planned Partitions", NULL, - aggstate->hash_planned_partitions, es); + + if (es->costs && aggstate->hash_planned_partitions > 0) + { + ExplainPropertyInteger("Planned Partitions", NULL, + aggstate->hash_planned_partitions, es); + } + + if (!es->analyze) + return; + + /* EXPLAIN ANALYZE */ + ExplainPropertyInteger("Peak Memory Usage", "kB", memPeakKb, es); + if (aggstate->hash_batches_used > 0) + { + ExplainPropertyInteger("Disk Usage", "kB", + aggstate->hash_disk_used, es); + ExplainPropertyInteger("HashAgg Batches", NULL, + aggstate->hash_batches_used, es); + } } + else + { + bool gotone = false; - if (!es->analyze) - return; + if (es->costs && aggstate->hash_planned_partitions > 0) + { + ExplainIndentText(es); + appendStringInfo(es->str, "Planned Partitions: %d", + aggstate->hash_planned_partitions); + gotone = true; + } + + if (!es->analyze) 
+ { + if (gotone) + appendStringInfoChar(es->str, '\n'); + return; + } + + if (!gotone) + ExplainIndentText(es); + else + appendStringInfoString(es->str, " "); + + appendStringInfo(es->str, "Peak Memory Usage: " INT64_FORMAT " kB", + memPeakKb); - /* EXPLAIN ANALYZE */ - ExplainPropertyInteger("Peak Memory Usage", "kB", memPeakKb, es); - if (aggstate->hash_batches_used > 0) + if (aggstate->hash_batches_used > 0) + appendStringInfo(es->str, " Disk Usage: " UINT64_FORMAT " kB HashAgg Batches: %d", + aggstate->hash_disk_used, + aggstate->hash_batches_used); + appendStringInfoChar(es->str, '\n'); + } + + /* Display stats for each parallel worker */ + if (es->analyze && aggstate->shared_info != NULL) { - ExplainPropertyInteger("Disk Usage", "kB", - aggstate->hash_disk_used, es); - ExplainPropertyInteger("HashAgg Batches", NULL, - aggstate->hash_batches_used, es); + for (int n = 0; n < aggstate->shared_info->num_workers; n++) + { + AggregateInstrumentation *sinstrument; + uint64 hash_disk_used; + int hash_batches_used; + + sinstrument = &aggstate->shared_info->sinstrument[n]; + hash_disk_used = sinstrument->hash_disk_used; + hash_batches_used = sinstrument->hash_batches_used; + memPeakKb = (sinstrument->hash_mem_peak + 1023) / 1024; + + if (es->workers_state) + ExplainOpenWorker(n, es); + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + ExplainIndentText(es); + + appendStringInfo(es->str, "Peak Memory Usage: " INT64_FORMAT " kB", + memPeakKb); + + if (hash_batches_used > 0) + appendStringInfo(es->str, " Disk Usage: " UINT64_FORMAT " kB HashAgg Batches: %d", + hash_disk_used, hash_batches_used); + appendStringInfoChar(es->str, '\n'); + } + else + { + ExplainPropertyInteger("Peak Memory Usage", "kB", memPeakKb, + es); + if (hash_batches_used > 0) + { + ExplainPropertyInteger("Disk Usage", "kB", hash_disk_used, + es); + ExplainPropertyInteger("HashAgg Batches", NULL, + hash_batches_used, es); + } + } + + if (es->workers_state) + ExplainCloseWorker(n, es); + } } } diff --git 
a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c index 41cb41481df6..382e78fb7fed 100644 --- a/src/backend/executor/execParallel.c +++ b/src/backend/executor/execParallel.c @@ -25,6 +25,7 @@ #include "executor/execParallel.h" #include "executor/executor.h" +#include "executor/nodeAgg.h" #include "executor/nodeAppend.h" #include "executor/nodeBitmapHeapscan.h" #include "executor/nodeCustom.h" @@ -288,7 +289,10 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e) /* even when not parallel-aware, for EXPLAIN ANALYZE */ ExecIncrementalSortEstimate((IncrementalSortState *) planstate, e->pcxt); break; - + case T_AggState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecAggEstimate((AggState *) planstate, e->pcxt); + break; default: break; } @@ -505,7 +509,10 @@ ExecParallelInitializeDSM(PlanState *planstate, /* even when not parallel-aware, for EXPLAIN ANALYZE */ ExecIncrementalSortInitializeDSM((IncrementalSortState *) planstate, d->pcxt); break; - + case T_AggState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecAggInitializeDSM((AggState *) planstate, d->pcxt); + break; default: break; } @@ -1048,6 +1055,9 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate, case T_HashState: ExecHashRetrieveInstrumentation((HashState *) planstate); break; + case T_AggState: + ExecAggRetrieveInstrumentation((AggState *) planstate); + break; default: break; } @@ -1336,7 +1346,10 @@ ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt) ExecIncrementalSortInitializeWorker((IncrementalSortState *) planstate, pwcxt); break; - + case T_AggState: + /* even when not parallel-aware, for EXPLAIN ANALYZE */ + ExecAggInitializeWorker((AggState *) planstate, pwcxt); + break; default: break; } diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c index 331acee28141..a20554ae65a6 100644 --- a/src/backend/executor/nodeAgg.c +++ b/src/backend/executor/nodeAgg.c 
@@ -240,6 +240,7 @@ #include "postgres.h" #include "access/htup_details.h" +#include "access/parallel.h" #include "catalog/objectaccess.h" #include "catalog/pg_aggregate.h" #include "catalog/pg_proc.h" @@ -4483,6 +4484,22 @@ ExecEndAgg(AggState *node) int numGroupingSets = Max(node->maxsets, 1); int setno; + /* + * When ending a parallel worker, copy the statistics gathered by the + * worker back into shared memory so that it can be picked up by the main + * process to report in EXPLAIN ANALYZE. + */ + if (node->shared_info && IsParallelWorker()) + { + AggregateInstrumentation *si; + + Assert(ParallelWorkerNumber <= node->shared_info->num_workers); + si = &node->shared_info->sinstrument[ParallelWorkerNumber]; + si->hash_batches_used = node->hash_batches_used; + si->hash_disk_used = node->hash_disk_used; + si->hash_mem_peak = node->hash_mem_peak; + } + /* Make sure we have closed any open tuplesorts */ if (node->sort_in) @@ -4854,3 +4871,89 @@ aggregate_dummy(PG_FUNCTION_ARGS) fcinfo->flinfo->fn_oid); return (Datum) 0; /* keep compiler quiet */ } + +/* ---------------------------------------------------------------- + * Parallel Query Support + * ---------------------------------------------------------------- + */ + + /* ---------------------------------------------------------------- + * ExecAggEstimate + * + * Estimate space required to propagate aggregate statistics. 
+ * ---------------------------------------------------------------- + */ +void +ExecAggEstimate(AggState *node, ParallelContext *pcxt) +{ + Size size; + + /* don't need this if not instrumenting or no workers */ + if (!node->ss.ps.instrument || pcxt->nworkers == 0) + return; + + size = mul_size(pcxt->nworkers, sizeof(AggregateInstrumentation)); + size = add_size(size, offsetof(SharedAggInfo, sinstrument)); + shm_toc_estimate_chunk(&pcxt->estimator, size); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +/* ---------------------------------------------------------------- + * ExecAggInitializeDSM + * + * Initialize DSM space for aggregate statistics. + * ---------------------------------------------------------------- + */ +void +ExecAggInitializeDSM(AggState *node, ParallelContext *pcxt) +{ + Size size; + + /* don't need this if not instrumenting or no workers */ + if (!node->ss.ps.instrument || pcxt->nworkers == 0) + return; + + size = offsetof(SharedAggInfo, sinstrument) + + pcxt->nworkers * sizeof(AggregateInstrumentation); + node->shared_info = shm_toc_allocate(pcxt->toc, size); + /* ensure any unfilled slots will contain zeroes */ + memset(node->shared_info, 0, size); + node->shared_info->num_workers = pcxt->nworkers; + shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, + node->shared_info); +} + +/* ---------------------------------------------------------------- + * ExecAggInitializeWorker + * + * Attach worker to DSM space for aggregate statistics. + * ---------------------------------------------------------------- + */ +void +ExecAggInitializeWorker(AggState *node, ParallelWorkerContext *pwcxt) +{ + node->shared_info = + shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, true); +} + +/* ---------------------------------------------------------------- + * ExecAggRetrieveInstrumentation + * + * Transfer aggregate statistics from DSM to private memory. 
+ * ---------------------------------------------------------------- + */ +void +ExecAggRetrieveInstrumentation(AggState *node) +{ + Size size; + SharedAggInfo *si; + + if (node->shared_info == NULL) + return; + + size = offsetof(SharedAggInfo, sinstrument) + + node->shared_info->num_workers * sizeof(AggregateInstrumentation); + si = palloc(size); + memcpy(si, node->shared_info, size); + node->shared_info = si; +} diff --git a/src/include/executor/nodeAgg.h b/src/include/executor/nodeAgg.h index 92c2337fd3ac..bb0805abe091 100644 --- a/src/include/executor/nodeAgg.h +++ b/src/include/executor/nodeAgg.h @@ -14,6 +14,7 @@ #ifndef NODEAGG_H #define NODEAGG_H +#include "access/parallel.h" #include "nodes/execnodes.h" @@ -323,4 +324,10 @@ extern void hash_agg_set_limits(double hashentrysize, uint64 input_groups, int used_bits, Size *mem_limit, uint64 *ngroups_limit, int *num_partitions); +/* parallel instrumentation support */ +extern void ExecAggEstimate(AggState *node, ParallelContext *pcxt); +extern void ExecAggInitializeDSM(AggState *node, ParallelContext *pcxt); +extern void ExecAggInitializeWorker(AggState *node, ParallelWorkerContext *pwcxt); +extern void ExecAggRetrieveInstrumentation(AggState *node); + #endif /* NODEAGG_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 98e0072b8ad2..f5dfa32d55c4 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -2101,6 +2101,27 @@ typedef struct GroupState bool grp_done; /* indicates completion of Group scan */ } GroupState; +/* --------------------- + * per-worker aggregate information + * --------------------- + */ +typedef struct AggregateInstrumentation +{ + Size hash_mem_peak; /* peak hash table memory usage */ + uint64 hash_disk_used; /* kB of disk space used */ + int hash_batches_used; /* batches used during entire execution */ +} AggregateInstrumentation; + +/* ---------------- + * Shared memory container for per-worker aggregate information + * 
---------------- + */ +typedef struct SharedAggInfo +{ + int num_workers; + AggregateInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER]; +} SharedAggInfo; + /* --------------------- * AggState information * @@ -2190,6 +2211,7 @@ typedef struct AggState AggStatePerGroup *all_pergroups; /* array of first ->pergroups, than * ->hash_pergroup */ ProjectionInfo *combinedproj; /* projection machinery */ + SharedAggInfo *shared_info; /* one entry per worker */ } AggState; /* ---------------- From f9e9704f09daf882f5a1cf1fbe3f5a3150ae2bb9 Mon Sep 17 00:00:00 2001 From: Fujii Masao Date: Fri, 19 Jun 2020 17:15:52 +0900 Subject: [PATCH 016/334] Fix issues in invalidation of obsolete replication slots. This commit fixes the following issues. 1. There is the case where the slot is dropped while trying to invalidate it. InvalidateObsoleteReplicationSlots() did not handle this case, and which could cause checkpoint to fail. 2. InvalidateObsoleteReplicationSlots() could emit the same log message multiple times unnecessary. It should be logged only once. 3. When marking the slot as used, we always searched the target slot from all the replication slots even if we already found it. This could cause useless waste of cycles. Back-patch to v13 where these issues were added as a part of max_slot_wal_keep_size code. 
Author: Fujii Masao Reviewed-by: Kyotaro Horiguchi, Alvaro Herrera Discussion: https://postgr.es/m/66c05b67-3396-042c-1b41-bfa6c3ddcf82@oss.nttdata.com --- src/backend/replication/slot.c | 226 ++++++++++++++++++++++----------- 1 file changed, 154 insertions(+), 72 deletions(-) diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index 505445f2dc84..a7bbcf34991a 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -99,6 +99,9 @@ ReplicationSlot *MyReplicationSlot = NULL; int max_replication_slots = 0; /* the maximum number of replication * slots */ +static ReplicationSlot *SearchNamedReplicationSlot(const char *name); +static int ReplicationSlotAcquireInternal(ReplicationSlot *slot, + const char *name, SlotAcquireBehavior behavior); static void ReplicationSlotDropAcquired(void); static void ReplicationSlotDropPtr(ReplicationSlot *slot); @@ -322,77 +325,117 @@ ReplicationSlotCreate(const char *name, bool db_specific, } /* - * Find a previously created slot and mark it as used by this backend. + * Search for the named replication slot. * - * The return value is only useful if behavior is SAB_Inquire, in which - * it's zero if we successfully acquired the slot, or the PID of the - * owning process otherwise. If behavior is SAB_Error, then trying to - * acquire an owned slot is an error. If SAB_Block, we sleep until the - * slot is released by the owning process. + * Return the replication slot if found, otherwise NULL. + * + * The caller must hold ReplicationSlotControlLock in shared mode. */ -int -ReplicationSlotAcquire(const char *name, SlotAcquireBehavior behavior) +static ReplicationSlot * +SearchNamedReplicationSlot(const char *name) { - ReplicationSlot *slot; - int active_pid; int i; + ReplicationSlot *slot = NULL; -retry: - Assert(MyReplicationSlot == NULL); + Assert(LWLockHeldByMeInMode(ReplicationSlotControlLock, + LW_SHARED)); - /* - * Search for the named slot and mark it active if we find it. 
If the - * slot is already active, we exit the loop with active_pid set to the PID - * of the backend that owns it. - */ - active_pid = 0; - slot = NULL; - LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); for (i = 0; i < max_replication_slots; i++) { ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; if (s->in_use && strcmp(name, NameStr(s->data.name)) == 0) { - /* - * This is the slot we want; check if it's active under some other - * process. In single user mode, we don't need this check. - */ - if (IsUnderPostmaster) - { - /* - * Get ready to sleep on it in case it is active. (We may end - * up not sleeping, but we don't want to do this while holding - * the spinlock.) - */ - ConditionVariablePrepareToSleep(&s->active_cv); - - SpinLockAcquire(&s->mutex); - - active_pid = s->active_pid; - if (active_pid == 0) - active_pid = s->active_pid = MyProcPid; - - SpinLockRelease(&s->mutex); - } - else - active_pid = MyProcPid; slot = s; - break; } } - LWLockRelease(ReplicationSlotControlLock); - /* If we did not find the slot, error out. */ - if (slot == NULL) + return slot; +} + +/* + * Find a previously created slot and mark it as used by this process. + * + * The return value is only useful if behavior is SAB_Inquire, in which + * it's zero if we successfully acquired the slot, -1 if the slot no longer + * exists, or the PID of the owning process otherwise. If behavior is + * SAB_Error, then trying to acquire an owned slot is an error. + * If SAB_Block, we sleep until the slot is released by the owning process. + */ +int +ReplicationSlotAcquire(const char *name, SlotAcquireBehavior behavior) +{ + return ReplicationSlotAcquireInternal(NULL, name, behavior); +} + +/* + * Mark the specified slot as used by this process. + * + * Only one of slot and name can be specified. + * If slot == NULL, search for the slot with the given name. + * + * See comments about the return value in ReplicationSlotAcquire(). 
+ */ +static int +ReplicationSlotAcquireInternal(ReplicationSlot *slot, const char *name, + SlotAcquireBehavior behavior) +{ + ReplicationSlot *s; + int active_pid; + + AssertArg((slot == NULL) ^ (name == NULL)); + +retry: + Assert(MyReplicationSlot == NULL); + + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); + + /* + * Search for the slot with the specified name if the slot to acquire is + * not given. If the slot is not found, we either return -1 or error out. + */ + s = slot ? slot : SearchNamedReplicationSlot(name); + if (s == NULL || !s->in_use) + { + LWLockRelease(ReplicationSlotControlLock); + + if (behavior == SAB_Inquire) + return -1; ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), - errmsg("replication slot \"%s\" does not exist", name))); + errmsg("replication slot \"%s\" does not exist", + name ? name : NameStr(slot->data.name)))); + } /* - * If we found the slot but it's already active in another backend, we - * either error out or retry after a short wait, as caller specified. + * This is the slot we want; check if it's active under some other + * process. In single user mode, we don't need this check. + */ + if (IsUnderPostmaster) + { + /* + * Get ready to sleep on the slot in case it is active if SAB_Block. + * (We may end up not sleeping, but we don't want to do this while + * holding the spinlock.) + */ + if (behavior == SAB_Block) + ConditionVariablePrepareToSleep(&s->active_cv); + + SpinLockAcquire(&s->mutex); + if (s->active_pid == 0) + s->active_pid = MyProcPid; + active_pid = s->active_pid; + SpinLockRelease(&s->mutex); + } + else + active_pid = MyProcPid; + LWLockRelease(ReplicationSlotControlLock); + + /* + * If we found the slot but it's already active in another process, we + * either error out, return the PID of the owning process, or retry + * after a short wait, as caller specified. 
*/ if (active_pid != MyProcPid) { @@ -400,24 +443,24 @@ ReplicationSlotAcquire(const char *name, SlotAcquireBehavior behavior) ereport(ERROR, (errcode(ERRCODE_OBJECT_IN_USE), errmsg("replication slot \"%s\" is active for PID %d", - name, active_pid))); + NameStr(s->data.name), active_pid))); else if (behavior == SAB_Inquire) return active_pid; /* Wait here until we get signaled, and then restart */ - ConditionVariableSleep(&slot->active_cv, + ConditionVariableSleep(&s->active_cv, WAIT_EVENT_REPLICATION_SLOT_DROP); ConditionVariableCancelSleep(); goto retry; } - else - ConditionVariableCancelSleep(); /* no sleep needed after all */ + else if (behavior == SAB_Block) + ConditionVariableCancelSleep(); /* no sleep needed after all */ /* Let everybody know we've modified this slot */ - ConditionVariableBroadcast(&slot->active_cv); + ConditionVariableBroadcast(&s->active_cv); /* We made this slot active, so it's ours now. */ - MyReplicationSlot = slot; + MyReplicationSlot = s; /* success */ return 0; @@ -1100,43 +1143,82 @@ InvalidateObsoleteReplicationSlots(XLogSegNo oldestSegno) ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; XLogRecPtr restart_lsn = InvalidXLogRecPtr; NameData slotname; + int wspid; + int last_signaled_pid = 0; if (!s->in_use) continue; SpinLockAcquire(&s->mutex); - if (s->data.restart_lsn == InvalidXLogRecPtr || - s->data.restart_lsn >= oldestLSN) - { - SpinLockRelease(&s->mutex); - continue; - } - slotname = s->data.name; restart_lsn = s->data.restart_lsn; - SpinLockRelease(&s->mutex); + + if (XLogRecPtrIsInvalid(restart_lsn) || restart_lsn >= oldestLSN) + continue; LWLockRelease(ReplicationSlotControlLock); + /* Get ready to sleep on the slot in case it is active */ + ConditionVariablePrepareToSleep(&s->active_cv); + for (;;) { - int wspid = ReplicationSlotAcquire(NameStr(slotname), - SAB_Inquire); + /* + * Try to mark this slot as used by this process. 
+ * + * Note that ReplicationSlotAcquireInternal(SAB_Inquire) + * should not cancel the prepared condition variable + * if this slot is active in other process. Because in this case + * we have to wait on that CV for the process owning + * the slot to be terminated, later. + */ + wspid = ReplicationSlotAcquireInternal(s, NULL, SAB_Inquire); - /* no walsender? success! */ - if (wspid == 0) + /* + * Exit the loop if we successfully acquired the slot or + * the slot was dropped during waiting for the owning process + * to be terminated. For example, the latter case is likely to + * happen when the slot is temporary because it's automatically + * dropped by the termination of the owning process. + */ + if (wspid <= 0) break; - ereport(LOG, - (errmsg("terminating walsender %d because replication slot \"%s\" is too far behind", - wspid, NameStr(slotname)))); - (void) kill(wspid, SIGTERM); + /* + * Signal to terminate the process that owns the slot. + * + * There is the race condition where other process may own + * the slot after the process using it was terminated and before + * this process owns it. To handle this case, we signal again + * if the PID of the owning process is changed than the last. + * + * XXX This logic assumes that the same PID is not reused + * very quickly. + */ + if (last_signaled_pid != wspid) + { + ereport(LOG, + (errmsg("terminating process %d because replication slot \"%s\" is too far behind", + wspid, NameStr(slotname)))); + (void) kill(wspid, SIGTERM); + last_signaled_pid = wspid; + } ConditionVariableTimedSleep(&s->active_cv, 10, WAIT_EVENT_REPLICATION_SLOT_DROP); } ConditionVariableCancelSleep(); + /* + * Do nothing here and start from scratch if the slot has + * already been dropped. 
+ */ + if (wspid == -1) + { + CHECK_FOR_INTERRUPTS(); + goto restart; + } + ereport(LOG, (errmsg("invalidating slot \"%s\" because its restart_lsn %X/%X exceeds max_slot_wal_keep_size", NameStr(slotname), From be14f884d57bc9c8ec8415edafea35ba5d31af59 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Fri, 19 Jun 2020 08:57:24 -0700 Subject: [PATCH 017/334] Fix deduplication "single value" strategy bug. It was possible for deduplication's single value strategy to mistakenly believe that a very small duplicate tuple counts as one of the six large tuples that it aims to leave behind after the page finally splits. This could cause slightly suboptimal space utilization with very low cardinality indexes, though only under fairly narrow conditions. To fix, be particular about what kind of tuple counts as a maxpostingsize-capped tuple. This avoids confusion in the event of a small tuple that gets "wedged" between two large tuples, where all tuples on the page are duplicates of the same value. 
Discussion: https://postgr.es/m/CAH2-Wz=Y+sgSFc-O3LpiZX-POx2bC+okec2KafERHuzdVa7-rQ@mail.gmail.com Backpatch: 13-, where deduplication was introduced (by commit 0d861bbb) --- src/backend/access/nbtree/nbtdedup.c | 41 +++++++++++++++++++--------- src/backend/access/nbtree/nbtsort.c | 2 ++ src/backend/access/nbtree/nbtxlog.c | 1 + src/include/access/nbtree.h | 1 + 4 files changed, 32 insertions(+), 13 deletions(-) diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c index b20faf693daa..f6be865b17e3 100644 --- a/src/backend/access/nbtree/nbtdedup.c +++ b/src/backend/access/nbtree/nbtdedup.c @@ -62,7 +62,6 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel, Page page = BufferGetPage(buf); BTPageOpaque opaque; Page newpage; - int newpagendataitems = 0; OffsetNumber deletable[MaxIndexTuplesPerPage]; BTDedupState state; int ndeletable = 0; @@ -124,6 +123,7 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel, */ state = (BTDedupState) palloc(sizeof(BTDedupStateData)); state->deduplicate = true; + state->nmaxitems = 0; state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK); /* Metadata about base tuple of current pending posting list */ state->base = NULL; @@ -204,26 +204,25 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel, * reset the state and move on without modifying the page. */ pagesaving += _bt_dedup_finish_pending(newpage, state); - newpagendataitems++; if (singlevalstrat) { /* * Single value strategy's extra steps. * - * Lower maxpostingsize for sixth and final item that might be - * deduplicated by current deduplication pass. When sixth - * item formed/observed, stop deduplicating items. + * Lower maxpostingsize for sixth and final large posting list + * tuple at the point where 5 maxpostingsize-capped tuples + * have either been formed or observed. 
* - * Note: It's possible that this will be reached even when - * current deduplication pass has yet to merge together some - * existing items. It doesn't matter whether or not the - * current call generated the maxpostingsize-capped duplicate - * tuples at the start of the page. + * When a sixth maxpostingsize-capped item is formed/observed, + * stop merging together tuples altogether. The few tuples + * that remain at the end of the page won't be merged together + * at all (at least not until after a future page split takes + * place). */ - if (newpagendataitems == 5) + if (state->nmaxitems == 5) _bt_singleval_fillfactor(page, state, newitemsz); - else if (newpagendataitems == 6) + else if (state->nmaxitems == 6) { state->deduplicate = false; singlevalstrat = false; /* won't be back here */ @@ -237,7 +236,6 @@ _bt_dedup_one_page(Relation rel, Buffer buf, Relation heapRel, /* Handle the last item */ pagesaving += _bt_dedup_finish_pending(newpage, state); - newpagendataitems++; /* * If no items suitable for deduplication were found, newpage must be @@ -404,7 +402,24 @@ _bt_dedup_save_htid(BTDedupState state, IndexTuple itup) (state->nhtids + nhtids) * sizeof(ItemPointerData)); if (mergedtupsz > state->maxpostingsize) + { + /* + * Count this as an oversized item for single value strategy, though + * only when there are 50 TIDs in the final posting list tuple. This + * limit (which is fairly arbitrary) avoids confusion about how many + * 1/6 of a page tuples have been encountered/created by the current + * deduplication pass. + * + * Note: We deliberately don't consider which deduplication pass + * merged together tuples to create this item (could be a previous + * deduplication pass, or current pass). See _bt_do_singleval() + * comments. 
+ */ + if (state->nhtids > 50) + state->nmaxitems++; + return false; + } /* * Save heap TIDs to pending posting list tuple -- itup can be merged into diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 15f10a29d3da..c03998834d4a 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -1095,6 +1095,7 @@ _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state, pfree(postingtuple); } + dstate->nmaxitems = 0; dstate->nhtids = 0; dstate->nitems = 0; dstate->phystupsize = 0; @@ -1310,6 +1311,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) dstate = (BTDedupState) palloc(sizeof(BTDedupStateData)); dstate->deduplicate = true; /* unused */ + dstate->nmaxitems = 0; /* unused */ dstate->maxpostingsize = 0; /* set later */ /* Metadata about base tuple of current pending posting list */ dstate->base = NULL; diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 87a8612c28c4..5bec59d448dd 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -483,6 +483,7 @@ btree_xlog_dedup(XLogReaderState *record) state = (BTDedupState) palloc(sizeof(BTDedupStateData)); state->deduplicate = true; /* unused */ + state->nmaxitems = 0; /* unused */ /* Conservatively use larger maxpostingsize than primary */ state->maxpostingsize = BTMaxItemSize(page); state->base = NULL; diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 3b2bcb22a70e..79506c748b2e 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -739,6 +739,7 @@ typedef struct BTDedupStateData { /* Deduplication status info for entire pass over page */ bool deduplicate; /* Still deduplicating page? 
*/ + int nmaxitems; /* Number of max-sized tuples so far */ Size maxpostingsize; /* Limit on size of final tuple */ /* Metadata about base tuple of current pending posting list */ From 816cbb59e3008112c5b217af7b9213b7a09881bf Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Fri, 19 Jun 2020 12:55:43 -0400 Subject: [PATCH 018/334] Adjust some glossary terms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mostly in response to Jürgen Purtz critique of previous definitions, though I added many other changes. Author: Álvaro Herrera Reviewed-by: Jürgen Purtz Reviewed-by: Justin Pryzby Reviewed-by: Erik Rijkers Discussion: https://postgr.es/m/c1e06008-2132-30f4-9b38-877e8683d418@purtz.de --- doc/src/sgml/glossary.sgml | 399 ++++++++++++++++++++++--------------- 1 file changed, 240 insertions(+), 159 deletions(-) diff --git a/doc/src/sgml/glossary.sgml b/doc/src/sgml/glossary.sgml index 25b03f3b370f..c7c931c17e1b 100644 --- a/doc/src/sgml/glossary.sgml +++ b/doc/src/sgml/glossary.sgml @@ -23,7 +23,7 @@ - Aggregate function + Aggregate function (routine) A function that @@ -39,6 +39,11 @@ + + Analytic function + + + Analyze (operation) @@ -54,14 +59,13 @@ (Don't confuse this term with the ANALYZE option to the command.) + + For more information, see + . + - - Analytic function - - - Atomic @@ -98,8 +102,7 @@ An element with a certain name and data type found within a - tuple or - table. + tuple. @@ -389,40 +392,33 @@ - - Data directory + + Database - The base directory on the filesystem of a - server that contains all - data files and subdirectories associated with an - instance (with the - exception of tablespaces). - The environment variable PGDATA is commonly used to - refer to the - data directory. - - - An instance's storage - space comprises the data directory plus any additional tablespaces. + A named collection of + local SQL objects. For more information, see - . + . 
- - Database + + Database cluster - A named collection of - SQL objects. + A collection of databases and global SQL objects, + and their common static and dynamic metadata. + Sometimes referred to as a + cluster. - For more information, see - . + In PostgreSQL, the term + cluster is also sometimes used to refer to an instance. + (Don't confuse this term with the SQL command CLUSTER.) @@ -432,6 +428,31 @@ + + Data directory + + + The base directory on the filesystem of a + server that contains all + data files and subdirectories associated with a + database cluster + (with the exception of + tablespaces, + and optionally WAL). + The environment variable PGDATA is commonly used to + refer to the data directory. + + + A cluster's storage + space comprises the data directory plus any additional tablespaces. + + + For more information, see + . + + + + Data page @@ -578,7 +599,7 @@ - Foreign table + Foreign table (relation) A relation which appears to have @@ -631,12 +652,20 @@ - Function + Function (routine) - Any defined transformation of data. Many functions are already defined - within PostgreSQL itself, but user-defined - ones can also be added. + A type of routine that receives zero or more arguments, returns zero or more + output values, and is constrained to run within one transaction. + Functions are invoked as part of a query, for example via + SELECT. + Certain functions can return + sets; those are + called set-returning functions. + + + Functions can also be used for + triggers to invoke. For more information, see @@ -689,13 +718,12 @@ - Index + Index (relation) A relation that contains data derived from a table - (or relation types - such as a materialized view). + or materialized view. Its internal structure supports fast retrieval of and access to the original data. @@ -724,14 +752,12 @@ Instance - A set of databases and accompanying global SQL objects that are stored in - the same data directory - in a single server. 
- If running, one + A group of backend and auxiliary processes that communicate using + a common shared memory area. One postmaster process - manages a group of backend and auxiliary processes that communicate - using a common shared memory - area. Many instances can run on the same + manages the instance; one instance manages exactly one + database cluster + with all its databases. Many instances can run on the same server as long as their TCP ports do not conflict. @@ -739,14 +765,10 @@ The instance handles all key features of a DBMS: read and write access to files and shared memory, assurance of the ACID properties, - connections to client processes, + connections to + client processes, privilege verification, crash recovery, replication, etc. - - In PostgreSQL, the term - cluster is also sometimes used to refer to an instance. - (Don't confuse this term with the SQL command CLUSTER.) - @@ -769,8 +791,10 @@ Join - An SQL keyword used in SELECT statements for - combining data from multiple relations. + An operation and SQL keyword used in + queries + for combining data from multiple + relations. @@ -781,10 +805,10 @@ A means of identifying a row within a table or - relation by + other relation by values contained within one or more attributes - in that table. + in that relation. @@ -813,15 +837,6 @@ - - Log record - - - Archaic term for a WAL record. - - - - Logged @@ -855,6 +870,15 @@ + + Log record + + + Archaic term for a WAL record. + + + + Master (server) @@ -883,12 +907,13 @@ - Materialized view + Materialized view (relation) A relation that is - defined in the same way that a view - is, but stores data in the same way that a + defined by a SELECT statement + (just like a view), + but stores data in the same way that a table does. It cannot be modified via INSERT, UPDATE, or DELETE operations. @@ -949,6 +974,8 @@ One of several disjoint (not overlapping) subsets of a larger set. 
+ + In reference to a partitioned table: @@ -961,16 +988,18 @@ - In reference to a window function: + In reference to a window function + in a query, a partition is a user-defined criterion that identifies which neighboring - rows can be considered by the - function. + rows + of the query's result set + can be considered by the function. - Partitioned table + Partitioned table (relation) A relation that is @@ -997,20 +1026,6 @@ - - Primary (server) - - - When two or more databases - are linked via replication, - the server - that is considered the authoritative source of information is called - the primary, - also known as a master. - - - - Primary key @@ -1031,19 +1046,29 @@ + + Primary (server) + + + When two or more databases + are linked via replication, + the server + that is considered the authoritative source of information is called + the primary, + also known as a master. + + + + - Procedure + Procedure (routine) - A defined set of instructions for manipulating data within a - database. - A procedure can - be written in a variety of programming languages. They are - similar to functions, - but are different in that they must be invoked via the CALL - command rather than the SELECT or PERFORM - commands, and they are allowed to make transactional statements such + A type of routine. + Their distinctive qualities are that they do not return values, + and that they are allowed to make transactional statements such as COMMIT and ROLLBACK. + They are invoked via the CALL command. For more information, see @@ -1115,6 +1140,11 @@ indexes are all relations. + More generically, a relation is a set of tuples; for example, + the result of a query is also a relation. + + + In PostgreSQL, Class is an archaic synonym for relation. 
@@ -1155,16 +1185,23 @@ Result set - A data structure transmitted from a - backend process to - a client upon the - completion of an SQL - command, usually a SELECT but it can be an + A relation transmitted + from a backend process + to a client upon the + completion of an SQL command, usually a + SELECT but it can be an INSERT, UPDATE, or DELETE command if the RETURNING - clause is specified. The data structure consists of zero or more - rows with the same ordered set of - attributes. + clause is specified. + + + The fact that a result set is a relation means that a query can be used + in the definition of another query, becoming a + subquery. + + + + @@ -1216,6 +1253,27 @@ + + Routine + + + A defined set of instructions stored in the database system + that can be invoked for execution. + A routine can be written in a variety of programming + languages. Routines can be + functions + (including set-returning functions and + trigger functions), + aggregate functions, + and procedures. + + + Many routines are already defined within PostgreSQL + itself, but user-defined ones can also be added. + + + + Row @@ -1248,16 +1306,7 @@ Each SQL object must reside in exactly one schema. - The names of SQL objects of the same type in the same schema are enforced - to be unique. - There is no restriction on reusing a name in multiple schemas. - - - All system-defined SQL objects reside in schema pg_catalog, - and commonly many user-defined SQL objects reside in the default schema - public, - but it is common and recommended that other schemas are created to hold - application-specific SQL objects. + All system-defined SQL objects reside in schema pg_catalog. @@ -1299,6 +1348,19 @@ + + Sequence (relation) + + + A type of relation that is used to generate values. + Typically the generated values are sequential non-repeating numbers. + They are commonly used to generate surrogate + primary key + values. 
+ + + + + 00:00:00+1559 + 24:00:00-1559 1 microsecond From 9e496768b8a7303ea07888ea1baae8e2a57dda7b Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 3 Aug 2020 14:02:35 -0400 Subject: [PATCH 251/334] Remove unnecessary "DISTINCT" in psql's queries for \dAc and \dAf. A moment's examination of these queries is sufficient to see that they do not produce duplicate rows, unless perhaps there's catalog corruption. Using DISTINCT anyway is inefficient and confusing; moreover it sets a poor example for anyone who refers to psql -E output to see how to query the catalogs. --- src/bin/psql/describe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index 57266f4fc351..d81f1575bf4c 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -6071,7 +6071,7 @@ listOperatorClasses(const char *access_method_pattern, initPQExpBuffer(&buf); printfPQExpBuffer(&buf, - "SELECT DISTINCT" + "SELECT\n" " am.amname AS \"%s\",\n" " pg_catalog.format_type(c.opcintype, NULL) AS \"%s\",\n" " CASE\n" @@ -6166,7 +6166,7 @@ listOperatorFamilies(const char *access_method_pattern, initPQExpBuffer(&buf); printfPQExpBuffer(&buf, - "SELECT DISTINCT" + "SELECT\n" " am.amname AS \"%s\",\n" " CASE\n" " WHEN pg_catalog.pg_opfamily_is_visible(f.oid)\n" From a451b7d44249b8655db8d40476ace9f84d76ab88 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Mon, 3 Aug 2020 13:04:42 -0700 Subject: [PATCH 252/334] Add nbtree page deletion assertion. Add a documenting assertion that's similar to the nearby assertion added by commit cd8c73a3. This conveys that the entire call to _bt_pagedel() does no work if it isn't possible to get a descent stack for the initial scanblkno page. 
--- src/backend/access/nbtree/nbtpage.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 70bac0052fc6..53dff3268083 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -1697,6 +1697,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact) if (_bt_leftsib_splitflag(rel, leftsib, leafblkno)) { ReleaseBuffer(leafbuf); + Assert(ndeleted == 0); return ndeleted; } From 9a9db08ae46209edcc5ecb120328a2bf92fd6069 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Mon, 3 Aug 2020 15:54:38 -0700 Subject: [PATCH 253/334] Fix replica backward scan race condition. It was possible for the logic used by backward scans (which must reason about concurrent page splits/deletions in its own peculiar way) to become confused when running on a replica. Concurrent replay of a WAL record that describes the second phase of page deletion could cause _bt_walk_left() to get confused. btree_xlog_unlink_page() simply failed to adhere to the same locking protocol that we use on the primary, which is obviously wrong once you consider these two disparate functions together. This bug is present in all stable branches. More concretely, the problem was that nothing stopped _bt_walk_left() from observing inconsistencies between the deletion's target page and its original sibling pages when running on a replica. This is true even though the second phase of page deletion is supposed to work as a single atomic action. Queries running on replicas raised "could not find left sibling of block %u in index %s" can't-happen errors when they went back to their scan's "original" page and observed that the page has not been marked deleted (even though it really was concurrently deleted). There is no evidence that this actually happened in the real world. The issue came to light during unrelated feature development work. 
Note that _bt_walk_left() is the only code that cares about the difference between a half-dead page and a fully deleted page that isn't also exclusively used by nbtree VACUUM (unless you include contrib/amcheck code). It seems very likely that backward scans are the only thing that could become confused by the inconsistency. Even amcheck's complex bt_right_page_check_scankey() dance was unaffected. To fix, teach btree_xlog_unlink_page() to lock the left sibling, target, and right sibling pages in that order before releasing any locks (just like _bt_unlink_halfdead_page()). This is the simplest possible approach. There doesn't seem to be any opportunity to be more clever about lock acquisition in the REDO routine, and it hardly seems worth the trouble in any case. This fix might enable contrib/amcheck verification of leaf page sibling links with only an AccessShareLock on the relation. An amcheck patch from Andrey Borodin was rejected back in January because it clashed with btree_xlog_unlink_page()'s lax approach to locking pages. It now seems likely that the real problem was with btree_xlog_unlink_page(), not the patch. This is a low severity, low likelihood bug, so no backpatch. Author: Michail Nikolaev Diagnosed-By: Michail Nikolaev Discussion: https://postgr.es/m/CANtu0ohkR-evAWbpzJu54V8eCOtqjJyYp3PQ_SGoBTRGXWhWRw@mail.gmail.com --- src/backend/access/nbtree/README | 18 ++++++ src/backend/access/nbtree/nbtxlog.c | 88 ++++++++++++++++++----------- 2 files changed, 72 insertions(+), 34 deletions(-) diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 32ad9e339a29..9d5fc424a574 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -572,6 +572,24 @@ replay of page deletion records does not hold a write lock on the target leaf page throughout; only the primary needs to block out concurrent writers that insert on to the page being deleted.) 
+There are also locking differences between the primary and WAL replay +for the first stage of a page split (i.e. same-level differences in +locking). Replay of the first phase of a page split can get away with +locking and updating the original right sibling page (which is also the +new right sibling page's right sibling) after locks on the original page +and its new right sibling have been released. Again, this is okay +because there are no writers. Page deletion WAL replay cannot get away +with being lax about same-level locking during replay, though -- doing +so risks confusing concurrent backwards scans. + +Page deletion's second phase locks the left sibling page, target page, +and right page in order on the standby, just like on the primary. This +allows backwards scans running on a standby to reason about page +deletion on the leaf level; a page cannot appear deleted without that +being reflected in the sibling pages. It's probably possible to be more +lax about how locks are acquired on the standby during the second phase +of page deletion, but that hardly seems worth it. + During recovery all index scans start with ignore_killed_tuples = false and we never set kill_prior_tuple. 
We do this because the oldest xmin on the standby server can be older than the oldest xmin on the primary diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 5d346da84fde..09d1b0e3419a 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -774,7 +774,9 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) xl_btree_unlink_page *xlrec = (xl_btree_unlink_page *) XLogRecGetData(record); BlockNumber leftsib; BlockNumber rightsib; - Buffer buffer; + Buffer leftbuf; + Buffer target; + Buffer rightbuf; Page page; BTPageOpaque pageop; @@ -783,46 +785,39 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) /* * In normal operation, we would lock all the pages this WAL record - * touches before changing any of them. In WAL replay, it should be okay - * to lock just one page at a time, since no concurrent index updates can - * be happening, and readers should not care whether they arrive at the - * target page or not (since it's surely empty). + * touches before changing any of them. In WAL replay, we at least lock + * the pages in the same standard left-to-right order (leftsib, target, + * rightsib), and don't release the sibling locks until the target is + * marked deleted. + * + * btree_xlog_split() can get away with fixing its right sibling page's + * left link last of all, after dropping all other locks. We prefer to + * avoid dropping locks on same-level pages early compared to normal + * operation. This keeps things simple for backwards scans. See + * nbtree/README. 
*/ - /* Fix left-link of right sibling */ - if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO) - { - page = (Page) BufferGetPage(buffer); - pageop = (BTPageOpaque) PageGetSpecialPointer(page); - pageop->btpo_prev = leftsib; - - PageSetLSN(page, lsn); - MarkBufferDirty(buffer); - } - if (BufferIsValid(buffer)) - UnlockReleaseBuffer(buffer); - /* Fix right-link of left sibling, if any */ if (leftsib != P_NONE) { - if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 1, &leftbuf) == BLK_NEEDS_REDO) { - page = (Page) BufferGetPage(buffer); + page = (Page) BufferGetPage(leftbuf); pageop = (BTPageOpaque) PageGetSpecialPointer(page); pageop->btpo_next = rightsib; PageSetLSN(page, lsn); - MarkBufferDirty(buffer); + MarkBufferDirty(leftbuf); } - if (BufferIsValid(buffer)) - UnlockReleaseBuffer(buffer); } + else + leftbuf = InvalidBuffer; /* Rewrite target page as empty deleted page */ - buffer = XLogInitBufferForRedo(record, 0); - page = (Page) BufferGetPage(buffer); + target = XLogInitBufferForRedo(record, 0); + page = (Page) BufferGetPage(target); - _bt_pageinit(page, BufferGetPageSize(buffer)); + _bt_pageinit(page, BufferGetPageSize(target)); pageop = (BTPageOpaque) PageGetSpecialPointer(page); pageop->btpo_prev = leftsib; @@ -832,8 +827,27 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) pageop->btpo_cycleid = 0; PageSetLSN(page, lsn); - MarkBufferDirty(buffer); - UnlockReleaseBuffer(buffer); + MarkBufferDirty(target); + + /* Fix left-link of right sibling */ + if (XLogReadBufferForRedo(record, 2, &rightbuf) == BLK_NEEDS_REDO) + { + page = (Page) BufferGetPage(rightbuf); + pageop = (BTPageOpaque) PageGetSpecialPointer(page); + pageop->btpo_prev = leftsib; + + PageSetLSN(page, lsn); + MarkBufferDirty(rightbuf); + } + + /* Release siblings */ + if (BufferIsValid(leftbuf)) + UnlockReleaseBuffer(leftbuf); + if (BufferIsValid(rightbuf)) + UnlockReleaseBuffer(rightbuf); + + /* Release target */ + 
UnlockReleaseBuffer(target); /* * If we deleted a parent of the targeted leaf page, instead of the leaf @@ -845,13 +859,19 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) /* * There is no real data on the page, so we just re-create it from * scratch using the information from the WAL record. + * + * Note that we don't end up here when the target page is also the + * leafbuf page. There is no need to add a dummy hikey item with a + * top parent link when deleting leafbuf because it's the last page + * we'll delete in the subtree undergoing deletion. */ - IndexTupleData trunctuple; + Buffer leafbuf; + IndexTupleData trunctuple; - buffer = XLogInitBufferForRedo(record, 3); - page = (Page) BufferGetPage(buffer); + leafbuf = XLogInitBufferForRedo(record, 3); + page = (Page) BufferGetPage(leafbuf); - _bt_pageinit(page, BufferGetPageSize(buffer)); + _bt_pageinit(page, BufferGetPageSize(leafbuf)); pageop = (BTPageOpaque) PageGetSpecialPointer(page); pageop->btpo_flags = BTP_HALF_DEAD | BTP_LEAF; @@ -870,8 +890,8 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) elog(ERROR, "could not add dummy high key to half-dead page"); PageSetLSN(page, lsn); - MarkBufferDirty(buffer); - UnlockReleaseBuffer(buffer); + MarkBufferDirty(leafbuf); + UnlockReleaseBuffer(leafbuf); } /* Update metapage if needed */ From dd877998d498c511352bd3640fd57f041c90ea62 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Tue, 4 Aug 2020 14:36:01 +0900 Subject: [PATCH 254/334] Make new SSL TAP test for channel_binding more robust The test would fail in an environment including a certificate file in ~/.postgresql/. bdd6e9b fixed a similar failure, and d6e612f introduced the same problem again with a new test. 
Author: Kyotaro Horiguchi Discussion: https://postgr.es/m/20200804.120033.31225582282178001.horikyota.ntt@gmail.com Backpatch-through: 13 --- src/test/ssl/t/002_scram.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/ssl/t/002_scram.pl b/src/test/ssl/t/002_scram.pl index 01231f8ba0f0..20ab0d5b0bc2 100644 --- a/src/test/ssl/t/002_scram.pl +++ b/src/test/ssl/t/002_scram.pl @@ -97,7 +97,7 @@ copy("ssl/client.key", $client_tmp_key); chmod 0600, $client_tmp_key; test_connect_fails( - "sslcert=ssl/client.crt sslkey=$client_tmp_key hostaddr=$SERVERHOSTADDR", + "sslcert=ssl/client.crt sslkey=$client_tmp_key sslrootcert=invalid hostaddr=$SERVERHOSTADDR", "dbname=certdb user=ssltestuser channel_binding=require", qr/channel binding required, but server authenticated client without channel binding/, "Cert authentication and channel_binding=require"); From 0f76294260b92849c4958fb706ecd5b5cd73e40e Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Tue, 4 Aug 2020 15:20:31 -0400 Subject: [PATCH 255/334] Increase hard-wired timeout values in ecpg regression tests. A couple of test cases had connect_timeout=14, a value that seems to have been plucked from a hat. While it's more than sufficient for normal cases, slow/overloaded buildfarm machines can get a timeout failure here, as per recent report from "sungazer". Increase to 180 seconds, which is in line with our typical timeouts elsewhere in the regression tests. Back-patch to 9.6; the code looks different in 9.5, and this doesn't seem to be quite worth the effort to adapt to that. 
Report: https://buildfarm.postgresql.org/cgi-bin/show_log.pl?nm=sungazer&dt=2020-08-04%2007%3A12%3A22 --- src/interfaces/ecpg/test/connect/test1.pgc | 2 +- src/interfaces/ecpg/test/connect/test5.pgc | 2 +- src/interfaces/ecpg/test/expected/connect-test1-minGW32.stderr | 2 +- src/interfaces/ecpg/test/expected/connect-test1.c | 2 +- src/interfaces/ecpg/test/expected/connect-test1.stderr | 2 +- src/interfaces/ecpg/test/expected/connect-test5.c | 2 +- src/interfaces/ecpg/test/expected/connect-test5.stderr | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/interfaces/ecpg/test/connect/test1.pgc b/src/interfaces/ecpg/test/connect/test1.pgc index 82cdfb8fc56e..961bd72ef2a9 100644 --- a/src/interfaces/ecpg/test/connect/test1.pgc +++ b/src/interfaces/ecpg/test/connect/test1.pgc @@ -46,7 +46,7 @@ exec sql end declare section; exec sql connect to unix:postgresql://localhost/ecpg2_regression user regress_ecpg_user1 using "connectpw"; exec sql disconnect; - exec sql connect to unix:postgresql://localhost/ecpg2_regression?connect_timeout=14 user regress_ecpg_user1; + exec sql connect to unix:postgresql://localhost/ecpg2_regression?connect_timeout=180 user regress_ecpg_user1; exec sql disconnect; /* wrong db */ diff --git a/src/interfaces/ecpg/test/connect/test5.pgc b/src/interfaces/ecpg/test/connect/test5.pgc index 2e34ab84fc62..e712fa87783f 100644 --- a/src/interfaces/ecpg/test/connect/test5.pgc +++ b/src/interfaces/ecpg/test/connect/test5.pgc @@ -55,7 +55,7 @@ exec sql end declare section; exec sql connect to 'unix:postgresql://localhost/ecpg2_regression' as main user :user USING "connectpw"; exec sql disconnect main; - exec sql connect to unix:postgresql://localhost/ecpg2_regression?connect_timeout=14&client_encoding=latin1 as main user regress_ecpg_user1/connectpw; + exec sql connect to unix:postgresql://localhost/ecpg2_regression?connect_timeout=180&client_encoding=latin1 as main user regress_ecpg_user1/connectpw; exec sql disconnect main; exec sql 
connect to "unix:postgresql://200.46.204.71/ecpg2_regression" as main user regress_ecpg_user1/connectpw; diff --git a/src/interfaces/ecpg/test/expected/connect-test1-minGW32.stderr b/src/interfaces/ecpg/test/expected/connect-test1-minGW32.stderr index b334537b6005..853453d980ec 100644 --- a/src/interfaces/ecpg/test/expected/connect-test1-minGW32.stderr +++ b/src/interfaces/ecpg/test/expected/connect-test1-minGW32.stderr @@ -48,7 +48,7 @@ [NO_PID]: sqlca: code: 0, state: 00000 [NO_PID]: ecpg_finish: connection ecpg2_regression closed [NO_PID]: sqlca: code: 0, state: 00000 -[NO_PID]: ECPGconnect: opening database ecpg2_regression on port with options connect_timeout=14 for user regress_ecpg_user1 +[NO_PID]: ECPGconnect: opening database ecpg2_regression on port with options connect_timeout=180 for user regress_ecpg_user1 [NO_PID]: sqlca: code: 0, state: 00000 [NO_PID]: ecpg_finish: connection ecpg2_regression closed [NO_PID]: sqlca: code: 0, state: 00000 diff --git a/src/interfaces/ecpg/test/expected/connect-test1.c b/src/interfaces/ecpg/test/expected/connect-test1.c index 894273339cd2..ffd24e2fc8f4 100644 --- a/src/interfaces/ecpg/test/expected/connect-test1.c +++ b/src/interfaces/ecpg/test/expected/connect-test1.c @@ -93,7 +93,7 @@ main(void) #line 47 "test1.pgc" - { ECPGconnect(__LINE__, 0, "unix:postgresql://localhost/ecpg2_regression?connect_timeout=14" , "regress_ecpg_user1" , NULL , NULL, 0); } + { ECPGconnect(__LINE__, 0, "unix:postgresql://localhost/ecpg2_regression?connect_timeout=180" , "regress_ecpg_user1" , NULL , NULL, 0); } #line 49 "test1.pgc" { ECPGdisconnect(__LINE__, "CURRENT");} diff --git a/src/interfaces/ecpg/test/expected/connect-test1.stderr b/src/interfaces/ecpg/test/expected/connect-test1.stderr index c5cbf749efea..1986fc54adc2 100644 --- a/src/interfaces/ecpg/test/expected/connect-test1.stderr +++ b/src/interfaces/ecpg/test/expected/connect-test1.stderr @@ -48,7 +48,7 @@ [NO_PID]: sqlca: code: 0, state: 00000 [NO_PID]: ecpg_finish: 
connection ecpg2_regression closed [NO_PID]: sqlca: code: 0, state: 00000 -[NO_PID]: ECPGconnect: opening database ecpg2_regression on port with options connect_timeout=14 for user regress_ecpg_user1 +[NO_PID]: ECPGconnect: opening database ecpg2_regression on port with options connect_timeout=180 for user regress_ecpg_user1 [NO_PID]: sqlca: code: 0, state: 00000 [NO_PID]: ecpg_finish: connection ecpg2_regression closed [NO_PID]: sqlca: code: 0, state: 00000 diff --git a/src/interfaces/ecpg/test/expected/connect-test5.c b/src/interfaces/ecpg/test/expected/connect-test5.c index b44104854da0..6ae5b589dea4 100644 --- a/src/interfaces/ecpg/test/expected/connect-test5.c +++ b/src/interfaces/ecpg/test/expected/connect-test5.c @@ -121,7 +121,7 @@ main(void) #line 56 "test5.pgc" - { ECPGconnect(__LINE__, 0, "unix:postgresql://localhost/ecpg2_regression?connect_timeout=14 & client_encoding=latin1" , "regress_ecpg_user1" , "connectpw" , "main", 0); } + { ECPGconnect(__LINE__, 0, "unix:postgresql://localhost/ecpg2_regression?connect_timeout=180 & client_encoding=latin1" , "regress_ecpg_user1" , "connectpw" , "main", 0); } #line 58 "test5.pgc" { ECPGdisconnect(__LINE__, "main");} diff --git a/src/interfaces/ecpg/test/expected/connect-test5.stderr b/src/interfaces/ecpg/test/expected/connect-test5.stderr index cefdb0739e5b..a54df175fbf0 100644 --- a/src/interfaces/ecpg/test/expected/connect-test5.stderr +++ b/src/interfaces/ecpg/test/expected/connect-test5.stderr @@ -61,7 +61,7 @@ [NO_PID]: sqlca: code: 0, state: 00000 [NO_PID]: ecpg_finish: connection main closed [NO_PID]: sqlca: code: 0, state: 00000 -[NO_PID]: ECPGconnect: opening database ecpg2_regression on port with options connect_timeout=14 & client_encoding=latin1 for user regress_ecpg_user1 +[NO_PID]: ECPGconnect: opening database ecpg2_regression on port with options connect_timeout=180 & client_encoding=latin1 for user regress_ecpg_user1 [NO_PID]: sqlca: code: 0, state: 00000 [NO_PID]: ecpg_finish: connection main 
closed [NO_PID]: sqlca: code: 0, state: 00000 From f47b5e139579a77c1f7c63400f01ea39d515e8c8 Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Wed, 5 Aug 2020 02:15:34 +0300 Subject: [PATCH 256/334] Remove btree page items after page unlink Currently, page unlink leaves remaining items "as is", but replay of corresponding WAL-record re-initializes page leaving it with no items. For the sake of consistency, this commit makes primary delete all the items during page unlink as well. Thanks to this change, we now don't mask contents of deleted btree page for WAL consistency checking. Discussion: https://postgr.es/m/CAPpHfdt_OTyQpXaPJcWzV2N-LNeNJseNB-K_A66qG%3DL518VTFw%40mail.gmail.com Author: Alexander Korotkov Reviewed-by: Peter Geoghegan --- contrib/amcheck/verify_nbtree.c | 7 ++----- src/backend/access/nbtree/nbtpage.c | 9 +++++++++ src/backend/access/nbtree/nbtxlog.c | 10 +--------- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index e4d501a85d1f..c9f9e755dccc 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -2864,11 +2864,8 @@ palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum) * As noted at the beginning of _bt_binsrch(), an internal page must have * children, since there must always be a negative infinity downlink * (there may also be a highkey). In the case of non-rightmost leaf - * pages, there must be at least a highkey. Deleted pages on replica - * might contain no items, because page unlink re-initializes - * page-to-be-deleted. Deleted pages with no items might be on primary - * too due to preceding recovery, but on primary new deletions can't - * happen concurrently to amcheck. + * pages, there must be at least a highkey. The exceptions are deleted + * pages, which contain no items. 
* * This is correct when pages are half-dead, since internal pages are * never half-dead, and leaf pages must have a high key when half-dead diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 53dff3268083..d5db9aaa3a13 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -2058,6 +2058,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, BTMetaPageData *metad = NULL; ItemId itemid; Page page; + PageHeader header; BTPageOpaque opaque; bool rightsib_is_rightmost; int targetlevel; @@ -2327,6 +2328,14 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, opaque->btpo_flags |= BTP_DELETED; opaque->btpo.xact = ReadNewTransactionId(); + /* + * Remove the remaining tuples on the page. This keeps things simple for + * WAL consistency checking. + */ + header = (PageHeader) page; + header->pd_lower = SizeOfPageHeaderData; + header->pd_upper = header->pd_special; + /* And update the metapage, if needed */ if (BufferIsValid(metabuf)) { diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 09d1b0e3419a..be0fa450f31d 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -1051,15 +1051,7 @@ btree_mask(char *pagedata, BlockNumber blkno) maskopaq = (BTPageOpaque) PageGetSpecialPointer(page); - if (P_ISDELETED(maskopaq)) - { - /* - * Mask page content on a DELETED page since it will be re-initialized - * during replay. See btree_xlog_unlink_page() for details. - */ - mask_page_content(page); - } - else if (P_ISLEAF(maskopaq)) + if (P_ISLEAF(maskopaq)) { /* * In btree leaf pages, it is possible to modify the LP_FLAGS without From 7a980dfc6c15add6ec3309932cf3061bb6745f65 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 5 Aug 2020 15:38:55 -0400 Subject: [PATCH 257/334] Fix matching of sub-partitions when a partitioned plan is stale. 
Since we no longer require AccessExclusiveLock to add a partition, the executor may see that a partitioned table has more partitions than the planner saw. ExecCreatePartitionPruneState's code for matching up the partition lists in such cases was faulty, and would misbehave if the planner had successfully pruned any partitions from the query. (Thus, trouble would occur only if a partition addition happens concurrently with a query that uses both static and dynamic partition pruning.) This led to an Assert failure in debug builds, and probably to crashes or query misbehavior in production builds. To repair the bug, just explicitly skip zeroes in the plan's relid_map[] list. I also made some cosmetic changes to make the code more readable (IMO anyway). Also, convert the cross-checking Assert to a regular test-and-elog, since it's now apparent that this logic is more fragile than one would like. Currently, there's no way to repeatably exercise this code, except with manual use of a debugger to stop the backend between planning and execution. Hence, no test case in this patch. We oughta do something about that testability gap, but that's for another day. Amit Langote and Tom Lane, per report from Justin Pryzby. Oversight in commit 898e5e329; backpatch to v12 where that appeared. Discussion: https://postgr.es/m/20200802181131.GA27754@telsasoft.com --- src/backend/executor/execPartition.c | 47 +++++++++++++++++++++------- 1 file changed, 36 insertions(+), 11 deletions(-) diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index fb6ce4905681..79fcbd6b0665 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -1667,26 +1667,51 @@ ExecCreatePartitionPruneState(PlanState *planstate, * present in the one used to construct subplan_map and * subpart_map. 
So we must construct new and longer arrays * where the partitions that were originally present map to - * the same place, and any added indexes map to -1, as if the - * new partitions had been pruned. + * the same sub-structures, and any added partitions map to + * -1, as if the new partitions had been pruned. + * + * Note: pinfo->relid_map[] may contain InvalidOid entries for + * partitions pruned by the planner. We cannot tell exactly + * which of the partdesc entries these correspond to, but we + * don't have to; just skip over them. The non-pruned + * relid_map entries, however, had better be a subset of the + * partdesc entries and in the same order. */ pprune->subpart_map = palloc(sizeof(int) * partdesc->nparts); - for (pp_idx = 0; pp_idx < partdesc->nparts; ++pp_idx) + for (pp_idx = 0; pp_idx < partdesc->nparts; pp_idx++) { - if (pinfo->relid_map[pd_idx] != partdesc->oids[pp_idx]) - { - pprune->subplan_map[pp_idx] = -1; - pprune->subpart_map[pp_idx] = -1; - } - else + /* Skip any InvalidOid relid_map entries */ + while (pd_idx < pinfo->nparts && + !OidIsValid(pinfo->relid_map[pd_idx])) + pd_idx++; + + if (pd_idx < pinfo->nparts && + pinfo->relid_map[pd_idx] == partdesc->oids[pp_idx]) { + /* match... */ pprune->subplan_map[pp_idx] = pinfo->subplan_map[pd_idx]; pprune->subpart_map[pp_idx] = - pinfo->subpart_map[pd_idx++]; + pinfo->subpart_map[pd_idx]; + pd_idx++; + } + else + { + /* this partdesc entry is not in the plan */ + pprune->subplan_map[pp_idx] = -1; + pprune->subpart_map[pp_idx] = -1; } } - Assert(pd_idx == pinfo->nparts); + + /* + * It might seem that we need to skip any trailing InvalidOid + * entries in pinfo->relid_map before checking that we scanned + * all of the relid_map. But we will have skipped them above, + * because they must correspond to some partdesc->oids + * entries; we just couldn't tell which. 
+ */ + if (pd_idx != pinfo->nparts) + elog(ERROR, "could not match partition child tables to plan elements"); } /* present_parts is also subject to later modification */ From a6775352476ac92d6b3eb3ae2dfd2775e3622afe Mon Sep 17 00:00:00 2001 From: Bruce Momjian Date: Wed, 5 Aug 2020 17:12:10 -0400 Subject: [PATCH 258/334] doc: clarify "state" table reference in tutorial Reported-by: Vyacheslav Shablistyy Discussion: https://postgr.es/m/159586122762.680.1361378513036616007@wrigleys.postgresql.org Backpatch-through: 9.5 --- doc/src/sgml/advanced.sgml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/src/sgml/advanced.sgml b/doc/src/sgml/advanced.sgml index f6c4627c3e0f..d77312600f7b 100644 --- a/doc/src/sgml/advanced.sgml +++ b/doc/src/sgml/advanced.sgml @@ -628,8 +628,9 @@ CREATE TABLE capitals ( parent, cities. The type of the column name is text, a native PostgreSQL - type for variable length character strings. State capitals have - an extra column, state, that shows their state. In + type for variable length character strings. The + capitals table has + an extra column, state, which shows their states. In PostgreSQL, a table can inherit from zero or more other tables. From bab150045bd9766869f471ede88734ea0989261c Mon Sep 17 00:00:00 2001 From: Robert Haas Date: Thu, 6 Aug 2020 14:13:03 -0400 Subject: [PATCH 259/334] Register llvm_shutdown using on_proc_exit, not before_shmem_exit. This seems more correct, because other before_shmem_exit calls may expect the infrastructure that is needed to run queries and access the database to be working, and also because this cleanup has nothing to do with shared memory. There are no known user-visible consequences to this, though, apart from what was previously fixed by commit 303640199d0436c5e7acdf50b837a027b5726594 and back-patched as commit bcbc27251d35336a6442761f59638138a772b839 and commit f7013683d9bb663a6a917421b1374306a32f165b, so for now, no back-patch.
Bharath Rupireddy Discussion: http://postgr.es/m/CALj2ACWk7j4F2v2fxxYfrroOF=AdFNPr1WsV+AGtHAFQOqm_pw@mail.gmail.com --- src/backend/jit/llvm/llvmjit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/jit/llvm/llvmjit.c b/src/backend/jit/llvm/llvmjit.c index af8b34aaaf3c..43bed78a5299 100644 --- a/src/backend/jit/llvm/llvmjit.c +++ b/src/backend/jit/llvm/llvmjit.c @@ -683,7 +683,7 @@ llvm_session_initialize(void) } #endif - before_shmem_exit(llvm_shutdown, 0); + on_proc_exit(llvm_shutdown, 0); llvm_session_initialized = true; From d5e96520ffca8eeeefc11f8fc82af610f68e63a8 Mon Sep 17 00:00:00 2001 From: David Rowley Date: Fri, 7 Aug 2020 10:22:18 +1200 Subject: [PATCH 260/334] Fix bogus EXPLAIN output for Hash Aggregate 9bdb300de modified the EXPLAIN output for Hash Aggregate to show details from parallel workers. However, it neglected to consider that a given parallel worker may not have assisted with the given Hash Aggregate. This can occur when workers fail to start or during Parallel Append with enable_partitionwise_join enabled when only a single worker is working on a non-parallel aware sub-plan. It could also happen if a worker simply wasn't fast enough to get any work done before other processes went and finished all the work. The bogus output came from the fact that ExplainOpenWorker() skipped showing any details for non-initialized workers but show_hashagg_info() did show details from the worker. This meant that the worker properties that were shown were not properly attributed to the worker that they belong to. In passing, we also now don't show Hash Aggregate properties for the leader process when it did not contribute any work to the Hash Aggregate. This can occur either during Parallel Append when only a parallel worker worked on a given sub plan or with parallel_leader_participation set to off. This aims to make the behavior of Hash Aggregate's EXPLAIN output more similar to Sort's. 
Reported-by: Justin Pryzby Discussion: https://postgr.es/m/20200805012105.GZ28072%40telsasoft.com Backpatch-through: 13, where the original breakage was introduced --- src/backend/commands/explain.c | 63 ++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 25 deletions(-) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 1e565fd33755..30e0a7ee7f21 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -3063,15 +3063,19 @@ show_hashagg_info(AggState *aggstate, ExplainState *es) ExplainPropertyInteger("Planned Partitions", NULL, aggstate->hash_planned_partitions, es); - if (!es->analyze) - return; - - /* EXPLAIN ANALYZE */ - ExplainPropertyInteger("HashAgg Batches", NULL, - aggstate->hash_batches_used, es); - ExplainPropertyInteger("Peak Memory Usage", "kB", memPeakKb, es); - ExplainPropertyInteger("Disk Usage", "kB", - aggstate->hash_disk_used, es); + /* + * During parallel query the leader may have not helped out. We + * detect this by checking how much memory it used. If we find it + * didn't do any work then we don't show its properties. + */ + if (es->analyze && aggstate->hash_mem_peak > 0) + { + ExplainPropertyInteger("HashAgg Batches", NULL, + aggstate->hash_batches_used, es); + ExplainPropertyInteger("Peak Memory Usage", "kB", memPeakKb, es); + ExplainPropertyInteger("Disk Usage", "kB", + aggstate->hash_disk_used, es); + } } else { @@ -3085,26 +3089,32 @@ show_hashagg_info(AggState *aggstate, ExplainState *es) gotone = true; } - if (!es->analyze) + /* + * During parallel query the leader may have not helped out. We + * detect this by checking how much memory it used. If we find it + * didn't do any work then we don't show its properties. 
+ */ + if (es->analyze && aggstate->hash_mem_peak > 0) { - if (gotone) - appendStringInfoChar(es->str, '\n'); - return; - } + if (!gotone) + ExplainIndentText(es); + else + appendStringInfoString(es->str, " "); - if (!gotone) - ExplainIndentText(es); - else - appendStringInfoString(es->str, " "); + appendStringInfo(es->str, "Batches: %d Memory Usage: " INT64_FORMAT "kB", + aggstate->hash_batches_used, memPeakKb); + gotone = true; - appendStringInfo(es->str, "Batches: %d Memory Usage: " INT64_FORMAT "kB", - aggstate->hash_batches_used, memPeakKb); + /* Only display disk usage if we spilled to disk */ + if (aggstate->hash_batches_used > 1) + { + appendStringInfo(es->str, " Disk Usage: " UINT64_FORMAT "kB", + aggstate->hash_disk_used); + } + } - /* Only display disk usage if we spilled to disk */ - if (aggstate->hash_batches_used > 1) - appendStringInfo(es->str, " Disk Usage: " UINT64_FORMAT "kB", - aggstate->hash_disk_used); - appendStringInfoChar(es->str, '\n'); + if (gotone) + appendStringInfoChar(es->str, '\n'); } /* Display stats for each parallel worker */ @@ -3117,6 +3127,9 @@ show_hashagg_info(AggState *aggstate, ExplainState *es) int hash_batches_used; sinstrument = &aggstate->shared_info->sinstrument[n]; + /* Skip workers that didn't do anything */ + if (sinstrument->hash_mem_peak == 0) + continue; hash_disk_used = sinstrument->hash_disk_used; hash_batches_used = sinstrument->hash_batches_used; memPeakKb = (sinstrument->hash_mem_peak + 1023) / 1024; From c254d8d7b20bf629420b407a5451c3b32d1a7b0b Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Thu, 6 Aug 2020 15:25:49 -0700 Subject: [PATCH 261/334] amcheck: Sanitize metapage's allequalimage field. This will be helpful if it ever proves necessary to revoke an opclass's support for deduplication. Backpatch: 13-, where nbtree deduplication was introduced. 
--- contrib/amcheck/verify_nbtree.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index c9f9e755dccc..384a8ac747e1 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -305,8 +305,20 @@ bt_index_check_internal(Oid indrelid, bool parentcheck, bool heapallindexed, errmsg("index \"%s\" lacks a main relation fork", RelationGetRelationName(indrel)))); - /* Check index, possibly against table it is an index on */ + /* Extract metadata from metapage, and sanitize it in passing */ _bt_metaversion(indrel, &heapkeyspace, &allequalimage); + if (allequalimage && !heapkeyspace) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" metapage has equalimage field set on unsupported nbtree version", + RelationGetRelationName(indrel)))); + if (allequalimage && !_bt_allequalimage(indrel, false)) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("index \"%s\" metapage incorrectly indicates that deduplication is safe", + RelationGetRelationName(indrel)))); + + /* Check index, possibly against table it is an index on */ bt_check_every_level(indrel, heaprel, heapkeyspace, parentcheck, heapallindexed, rootdescend); } From 3a3be80641c01e675d0ed484f15df8ec536d0a06 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Thu, 6 Aug 2020 16:23:52 -0700 Subject: [PATCH 262/334] Remove obsolete amcheck comment. Oversight in commit d114cc53871. --- contrib/amcheck/verify_nbtree.c | 1 - 1 file changed, 1 deletion(-) diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 384a8ac747e1..b87a3cb4717c 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -903,7 +903,6 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) * tuple. * * - That downlink to block was encountered in parent where that's expected. - * (Limited to readonly callers.) 
* * - That high keys of child pages matches corresponding pivot keys in parent. * From 199cec9779504c08aaa8159c6308283156547409 Mon Sep 17 00:00:00 2001 From: Etsuro Fujita Date: Fri, 7 Aug 2020 14:45:00 +0900 Subject: [PATCH 263/334] Fix yet another issue with step generation in partition pruning. Commit 13838740f fixed some issues with step generation in partition pruning, but there was yet another one: get_steps_using_prefix() assumes that clauses in the passed-in prefix list are sorted in ascending order of their partition key numbers, but the caller failed to ensure this for range partitioning, which led to an assertion failure in debug builds. Adjust the caller function to arrange the clauses in the prefix list in the required order for range partitioning. Back-patch to v11, like the previous commit. Patch by me, reviewed by Amit Langote. Discussion: https://postgr.es/m/CAPmGK16jkXiFG0YqMbU66wte-oJTfW6D1HaNvQf%3D%2B5o9%3Dm55wQ%40mail.gmail.com --- src/backend/partitioning/partprune.c | 138 ++++++++++-------- src/test/regress/expected/partition_prune.out | 10 ++ src/test/regress/sql/partition_prune.sql | 5 + 3 files changed, 96 insertions(+), 57 deletions(-) diff --git a/src/backend/partitioning/partprune.c b/src/backend/partitioning/partprune.c index 253c69064982..6268623d5699 100644 --- a/src/backend/partitioning/partprune.c +++ b/src/backend/partitioning/partprune.c @@ -1362,7 +1362,6 @@ gen_prune_steps_from_opexps(GeneratePruningStepsContext *context, List *eq_clauses = btree_clauses[BTEqualStrategyNumber]; List *le_clauses = btree_clauses[BTLessEqualStrategyNumber]; List *ge_clauses = btree_clauses[BTGreaterEqualStrategyNumber]; - bool pk_has_clauses[PARTITION_MAX_KEYS]; int strat; /* @@ -1382,10 +1381,15 @@ gen_prune_steps_from_opexps(GeneratePruningStepsContext *context, foreach(lc, btree_clauses[strat]) { PartClauseInfo *pc = lfirst(lc); + ListCell *eq_start; + ListCell *le_start; + ListCell *ge_start; ListCell *lc1; List *prefix = NIL; List *pc_steps; 
bool prefix_valid = true; + bool pk_has_clauses; + int keyno; /* * If this is a clause for the first partition key, @@ -1410,79 +1414,96 @@ gen_prune_steps_from_opexps(GeneratePruningStepsContext *context, continue; } - /* (Re-)initialize the pk_has_clauses array */ - Assert(pc->keyno > 0); - for (i = 0; i < pc->keyno; i++) - pk_has_clauses[i] = false; + eq_start = list_head(eq_clauses); + le_start = list_head(le_clauses); + ge_start = list_head(ge_clauses); /* - * Expressions from = clauses can always be in the - * prefix, provided they're from an earlier key. + * We arrange clauses into prefix in ascending order + * of their partition key numbers. */ - foreach(lc1, eq_clauses) + for (keyno = 0; keyno < pc->keyno; keyno++) { - PartClauseInfo *eqpc = lfirst(lc1); + pk_has_clauses = false; - if (eqpc->keyno == pc->keyno) - break; - if (eqpc->keyno < pc->keyno) + /* + * Expressions from = clauses can always be in the + * prefix, provided they're from an earlier key. + */ + for_each_cell(lc1, eq_clauses, eq_start) { - prefix = lappend(prefix, eqpc); - pk_has_clauses[eqpc->keyno] = true; - } - } + PartClauseInfo *eqpc = lfirst(lc1); - /* - * If we're generating steps for keyno == pc->keyno) + if (eqpc->keyno == keyno) + { + prefix = lappend(prefix, eqpc); + pk_has_clauses = true; + } + else + { + Assert(eqpc->keyno > keyno); break; - if (lepc->keyno < pc->keyno) + } + } + eq_start = lc1; + + /* + * If we're generating steps for keyno] = true; + PartClauseInfo *lepc = lfirst(lc1); + + if (lepc->keyno == keyno) + { + prefix = lappend(prefix, lepc); + pk_has_clauses = true; + } + else + { + Assert(lepc->keyno > keyno); + break; + } } + le_start = lc1; } - } - /* - * If we're generating steps for >/>= strategy, we can - * add other >= clauses to the prefix, provided - * they're from an earlier key. 
- */ - if (strat == BTGreaterStrategyNumber || - strat == BTGreaterEqualStrategyNumber) - { - foreach(lc1, ge_clauses) + /* + * If we're generating steps for >/>= strategy, we + * can add other >= clauses to the prefix, + * provided they're from an earlier key. + */ + if (strat == BTGreaterStrategyNumber || + strat == BTGreaterEqualStrategyNumber) { - PartClauseInfo *gepc = lfirst(lc1); - - if (gepc->keyno == pc->keyno) - break; - if (gepc->keyno < pc->keyno) + for_each_cell(lc1, ge_clauses, ge_start) { - prefix = lappend(prefix, gepc); - pk_has_clauses[gepc->keyno] = true; + PartClauseInfo *gepc = lfirst(lc1); + + if (gepc->keyno == keyno) + { + prefix = lappend(prefix, gepc); + pk_has_clauses = true; + } + else + { + Assert(gepc->keyno > keyno); + break; + } } + ge_start = lc1; } - } - /* - * Check whether every earlier partition key has at - * least one clause. - */ - for (i = 0; i < pc->keyno; i++) - { - if (!pk_has_clauses[i]) + /* + * If this key has no clauses, prefix is not valid + * anymore. + */ + if (!pk_has_clauses) { prefix_valid = false; break; @@ -2241,6 +2262,9 @@ match_clause_to_partition_key(GeneratePruningStepsContext *context, * non-NULL, but they must ensure that prefix contains at least one clause * for each of the partition keys other than those specified in step_nullkeys * and step_lastkeyno. + * + * For both cases, callers must also ensure that clauses in prefix are sorted + * in ascending order of their partition key numbers. 
*/ static List * get_steps_using_prefix(GeneratePruningStepsContext *context, diff --git a/src/test/regress/expected/partition_prune.out b/src/test/regress/expected/partition_prune.out index 687cf8c5f415..50d2a7e4b975 100644 --- a/src/test/regress/expected/partition_prune.out +++ b/src/test/regress/expected/partition_prune.out @@ -3711,6 +3711,16 @@ explain (costs off) select * from rp_prefix_test3 where a >= 1 and b >= 1 and b Filter: ((a >= 1) AND (b >= 1) AND (b >= 2) AND (c >= 2) AND (d >= 0)) (2 rows) +-- Test that get_steps_using_prefix() handles a prefix that contains multiple +-- clauses for the partition key b (ie, b >= 1 and b = 2) (This also tests +-- that the caller arranges clauses in that prefix in the required order) +explain (costs off) select * from rp_prefix_test3 where a >= 1 and b >= 1 and b = 2 and c = 2 and d >= 0; + QUERY PLAN +------------------------------------------------------------------------ + Seq Scan on rp_prefix_test3_p2 rp_prefix_test3 + Filter: ((a >= 1) AND (b >= 1) AND (d >= 0) AND (b = 2) AND (c = 2)) +(2 rows) + create table hp_prefix_test (a int, b int, c int, d int) partition by hash (a part_test_int4_ops, b part_test_int4_ops, c part_test_int4_ops, d part_test_int4_ops); create table hp_prefix_test_p1 partition of hp_prefix_test for values with (modulus 2, remainder 0); create table hp_prefix_test_p2 partition of hp_prefix_test for values with (modulus 2, remainder 1); diff --git a/src/test/regress/sql/partition_prune.sql b/src/test/regress/sql/partition_prune.sql index 93ef9dc1f340..1e904a8c5b7b 100644 --- a/src/test/regress/sql/partition_prune.sql +++ b/src/test/regress/sql/partition_prune.sql @@ -1080,6 +1080,11 @@ create table rp_prefix_test3_p2 partition of rp_prefix_test3 for values from (2, -- clauses for the partition key b (ie, b >= 1 and b >= 2) explain (costs off) select * from rp_prefix_test3 where a >= 1 and b >= 1 and b >= 2 and c >= 2 and d >= 0; +-- Test that get_steps_using_prefix() handles a prefix that 
contains multiple +-- clauses for the partition key b (ie, b >= 1 and b = 2) (This also tests +-- that the caller arranges clauses in that prefix in the required order) +explain (costs off) select * from rp_prefix_test3 where a >= 1 and b >= 1 and b = 2 and c = 2 and d >= 0; + create table hp_prefix_test (a int, b int, c int, d int) partition by hash (a part_test_int4_ops, b part_test_int4_ops, c part_test_int4_ops, d part_test_int4_ops); create table hp_prefix_test_p1 partition of hp_prefix_test for values with (modulus 2, remainder 0); create table hp_prefix_test_p2 partition of hp_prefix_test for values with (modulus 2, remainder 1); From 3df92bbd1dba98f72e3f005406463b0718193a0f Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Fri, 7 Aug 2020 09:53:27 -0700 Subject: [PATCH 264/334] Rename nbtree split REDO routine variables. Make the nbtree page split REDO routine variable names consistent with _bt_split() (which handles the original execution of page splits). These names make the code easier to follow by making the distinction between the original page and the left half of the split clear. (The left half of the split page is a temp page that REDO creates to replace the origpage contents.) Also reduce the elevel used when adding a new high key to the temp page from PANIC to ERROR to be consistent. We already only raise an ERROR when data item PageAddItem() temp page calls fail. 
--- src/backend/access/nbtree/nbtxlog.c | 96 ++++++++++++++--------------- 1 file changed, 48 insertions(+), 48 deletions(-) diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index be0fa450f31d..1fd639246328 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -256,20 +256,20 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) XLogRecPtr lsn = record->EndRecPtr; xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record); bool isleaf = (xlrec->level == 0); - Buffer lbuf; + Buffer buf; Buffer rbuf; Page rpage; BTPageOpaque ropaque; char *datapos; Size datalen; - BlockNumber leftsib; - BlockNumber rightsib; - BlockNumber rnext; + BlockNumber origpagenumber; + BlockNumber rightpagenumber; + BlockNumber spagenumber; - XLogRecGetBlockTag(record, 0, NULL, NULL, &leftsib); - XLogRecGetBlockTag(record, 1, NULL, NULL, &rightsib); - if (!XLogRecGetBlockTag(record, 2, NULL, NULL, &rnext)) - rnext = P_NONE; + XLogRecGetBlockTag(record, 0, NULL, NULL, &origpagenumber); + XLogRecGetBlockTag(record, 1, NULL, NULL, &rightpagenumber); + if (!XLogRecGetBlockTag(record, 2, NULL, NULL, &spagenumber)) + spagenumber = P_NONE; /* * Clear the incomplete split flag on the left sibling of the child page @@ -287,8 +287,8 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) _bt_pageinit(rpage, BufferGetPageSize(rbuf)); ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage); - ropaque->btpo_prev = leftsib; - ropaque->btpo_next = rnext; + ropaque->btpo_prev = origpagenumber; + ropaque->btpo_next = spagenumber; ropaque->btpo.level = xlrec->level; ropaque->btpo_flags = isleaf ? 
BTP_LEAF : 0; ropaque->btpo_cycleid = 0; @@ -298,8 +298,8 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) PageSetLSN(rpage, lsn); MarkBufferDirty(rbuf); - /* Now reconstruct left (original) sibling page */ - if (XLogReadBufferForRedo(record, 0, &lbuf) == BLK_NEEDS_REDO) + /* Now reconstruct original page (left half of split) */ + if (XLogReadBufferForRedo(record, 0, &buf) == BLK_NEEDS_REDO) { /* * To retain the same physical order of the tuples that they had, we @@ -309,15 +309,15 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) * checking possible. See also _bt_restore_page(), which does the * same for the right page. */ - Page lpage = (Page) BufferGetPage(lbuf); - BTPageOpaque lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage); + Page origpage = (Page) BufferGetPage(buf); + BTPageOpaque oopaque = (BTPageOpaque) PageGetSpecialPointer(origpage); OffsetNumber off; IndexTuple newitem = NULL, left_hikey = NULL, nposting = NULL; Size newitemsz = 0, left_hikeysz = 0; - Page newlpage; + Page leftpage; OffsetNumber leftoff, replacepostingoff = InvalidOffsetNumber; @@ -340,8 +340,8 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) /* Use mutable, aligned newitem copy in _bt_swap_posting() */ newitem = CopyIndexTuple(newitem); - itemid = PageGetItemId(lpage, replacepostingoff); - oposting = (IndexTuple) PageGetItem(lpage, itemid); + itemid = PageGetItemId(origpage, replacepostingoff); + oposting = (IndexTuple) PageGetItem(origpage, itemid); nposting = _bt_swap_posting(newitem, oposting, xlrec->postingoff); } @@ -359,16 +359,16 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) Assert(datalen == 0); - newlpage = PageGetTempPageCopySpecial(lpage); + leftpage = PageGetTempPageCopySpecial(origpage); - /* Set high key */ + /* Add high key tuple from WAL record to temp page */ leftoff = P_HIKEY; - if (PageAddItem(newlpage, (Item) left_hikey, left_hikeysz, - P_HIKEY, false, false) == InvalidOffsetNumber) - 
elog(PANIC, "failed to add high key to left page after split"); + if (PageAddItem(leftpage, (Item) left_hikey, left_hikeysz, P_HIKEY, + false, false) == InvalidOffsetNumber) + elog(ERROR, "failed to add high key to left page after split"); leftoff = OffsetNumberNext(leftoff); - for (off = P_FIRSTDATAKEY(lopaque); off < xlrec->firstrightoff; off++) + for (off = P_FIRSTDATAKEY(oopaque); off < xlrec->firstrightoff; off++) { ItemId itemid; Size itemsz; @@ -379,7 +379,7 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) { Assert(newitemonleft || xlrec->firstrightoff == xlrec->newitemoff); - if (PageAddItem(newlpage, (Item) nposting, + if (PageAddItem(leftpage, (Item) nposting, MAXALIGN(IndexTupleSize(nposting)), leftoff, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add new posting list item to left page after split"); @@ -390,16 +390,16 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) /* add the new item if it was inserted on left page */ else if (newitemonleft && off == xlrec->newitemoff) { - if (PageAddItem(newlpage, (Item) newitem, newitemsz, leftoff, + if (PageAddItem(leftpage, (Item) newitem, newitemsz, leftoff, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add new item to left page after split"); leftoff = OffsetNumberNext(leftoff); } - itemid = PageGetItemId(lpage, off); + itemid = PageGetItemId(origpage, off); itemsz = ItemIdGetLength(itemid); - item = (IndexTuple) PageGetItem(lpage, itemid); - if (PageAddItem(newlpage, (Item) item, itemsz, leftoff, + item = (IndexTuple) PageGetItem(origpage, itemid); + if (PageAddItem(leftpage, (Item) item, itemsz, leftoff, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add old item to left page after split"); leftoff = OffsetNumberNext(leftoff); @@ -408,31 +408,31 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) /* cope with possibility that newitem goes at the end */ if (newitemonleft && off == xlrec->newitemoff) { - if 
(PageAddItem(newlpage, (Item) newitem, newitemsz, leftoff, + if (PageAddItem(leftpage, (Item) newitem, newitemsz, leftoff, false, false) == InvalidOffsetNumber) elog(ERROR, "failed to add new item to left page after split"); leftoff = OffsetNumberNext(leftoff); } - PageRestoreTempPage(newlpage, lpage); + PageRestoreTempPage(leftpage, origpage); /* Fix opaque fields */ - lopaque->btpo_flags = BTP_INCOMPLETE_SPLIT; + oopaque->btpo_flags = BTP_INCOMPLETE_SPLIT; if (isleaf) - lopaque->btpo_flags |= BTP_LEAF; - lopaque->btpo_next = rightsib; - lopaque->btpo_cycleid = 0; + oopaque->btpo_flags |= BTP_LEAF; + oopaque->btpo_next = rightpagenumber; + oopaque->btpo_cycleid = 0; - PageSetLSN(lpage, lsn); - MarkBufferDirty(lbuf); + PageSetLSN(origpage, lsn); + MarkBufferDirty(buf); } /* * We no longer need the buffers. They must be released together, so that * readers cannot observe two inconsistent halves. */ - if (BufferIsValid(lbuf)) - UnlockReleaseBuffer(lbuf); + if (BufferIsValid(buf)) + UnlockReleaseBuffer(buf); UnlockReleaseBuffer(rbuf); /* @@ -443,22 +443,22 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) * replay, because no other index update can be in progress, and readers * will cope properly when following an obsolete left-link. 
*/ - if (rnext != P_NONE) + if (spagenumber != P_NONE) { - Buffer buffer; + Buffer sbuf; - if (XLogReadBufferForRedo(record, 2, &buffer) == BLK_NEEDS_REDO) + if (XLogReadBufferForRedo(record, 2, &sbuf) == BLK_NEEDS_REDO) { - Page page = (Page) BufferGetPage(buffer); - BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page); + Page spage = (Page) BufferGetPage(sbuf); + BTPageOpaque spageop = (BTPageOpaque) PageGetSpecialPointer(spage); - pageop->btpo_prev = rightsib; + spageop->btpo_prev = rightpagenumber; - PageSetLSN(page, lsn); - MarkBufferDirty(buffer); + PageSetLSN(spage, lsn); + MarkBufferDirty(sbuf); } - if (BufferIsValid(buffer)) - UnlockReleaseBuffer(buffer); + if (BufferIsValid(sbuf)) + UnlockReleaseBuffer(sbuf); } } From 6f0b632f083ba08fabb6c496caf733802cee9d2e Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 7 Aug 2020 14:30:41 -0400 Subject: [PATCH 265/334] Support testing of cases where table schemas change after planning. We have various cases where we allow DDL on tables to be performed with less than full AccessExclusiveLock. This requires concurrent queries to be able to cope with the DDL change mid-flight, but up to now we had no repeatable way to test such cases. To improve that, invent a test module that allows halting a backend after planning and then resuming execution once we've done desired actions in another session. (The same approach could be used to inject delays in other places, if there's a suitable hook available.) This commit includes a single test case, which is meant to exercise the previously-untestable ExecCreatePartitionPruneState code repaired by commit 7a980dfc6. We'd probably not bother with this if that were the only foreseen benefit, but I expect additional test cases will use this infrastructure in the future. Test module by Andy Fan, partition-addition test case by me. 
Discussion: https://postgr.es/m/20200802181131.GA27754@telsasoft.com --- src/test/modules/Makefile | 1 + src/test/modules/delay_execution/.gitignore | 3 + src/test/modules/delay_execution/Makefile | 21 ++++ .../modules/delay_execution/delay_execution.c | 104 ++++++++++++++++++ .../expected/partition-addition.out | 21 ++++ .../specs/partition-addition.spec | 38 +++++++ 6 files changed, 188 insertions(+) create mode 100644 src/test/modules/delay_execution/.gitignore create mode 100644 src/test/modules/delay_execution/Makefile create mode 100644 src/test/modules/delay_execution/delay_execution.c create mode 100644 src/test/modules/delay_execution/expected/partition-addition.out create mode 100644 src/test/modules/delay_execution/specs/partition-addition.spec diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile index 29de73c06062..1428529b041a 100644 --- a/src/test/modules/Makefile +++ b/src/test/modules/Makefile @@ -7,6 +7,7 @@ include $(top_builddir)/src/Makefile.global SUBDIRS = \ brin \ commit_ts \ + delay_execution \ dummy_index_am \ dummy_seclabel \ snapshot_too_old \ diff --git a/src/test/modules/delay_execution/.gitignore b/src/test/modules/delay_execution/.gitignore new file mode 100644 index 000000000000..ba2160b66ceb --- /dev/null +++ b/src/test/modules/delay_execution/.gitignore @@ -0,0 +1,3 @@ +# Generated subdirectories +/output_iso/ +/tmp_check_iso/ diff --git a/src/test/modules/delay_execution/Makefile b/src/test/modules/delay_execution/Makefile new file mode 100644 index 000000000000..f270aebf3a55 --- /dev/null +++ b/src/test/modules/delay_execution/Makefile @@ -0,0 +1,21 @@ +# src/test/modules/delay_execution/Makefile + +PGFILEDESC = "delay_execution - allow delay between parsing and execution" + +MODULE_big = delay_execution +OBJS = \ + $(WIN32RES) \ + delay_execution.o + +ISOLATION = partition-addition + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = 
src/test/modules/delay_execution +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/src/test/modules/delay_execution/delay_execution.c b/src/test/modules/delay_execution/delay_execution.c new file mode 100644 index 000000000000..03ea23d0f266 --- /dev/null +++ b/src/test/modules/delay_execution/delay_execution.c @@ -0,0 +1,104 @@ +/*------------------------------------------------------------------------- + * + * delay_execution.c + * Test module to allow delay between parsing and execution of a query. + * + * The delay is implemented by taking and immediately releasing a specified + * advisory lock. If another process has previously taken that lock, the + * current process will be blocked until the lock is released; otherwise, + * there's no effect. This allows an isolationtester script to reliably + * test behaviors where some specified action happens in another backend + * between parsing and execution of any desired query. + * + * Copyright (c) 2020, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/test/modules/delay_execution/delay_execution.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include + +#include "optimizer/planner.h" +#include "utils/builtins.h" +#include "utils/guc.h" +#include "utils/inval.h" + + +PG_MODULE_MAGIC; + +/* GUC: advisory lock ID to use. Zero disables the feature. 
*/ +static int post_planning_lock_id = 0; + +/* Save previous planner hook user to be a good citizen */ +static planner_hook_type prev_planner_hook = NULL; + +/* Module load/unload functions */ +void _PG_init(void); +void _PG_fini(void); + + +/* planner_hook function to provide the desired delay */ +static PlannedStmt * +delay_execution_planner(Query *parse, const char *query_string, + int cursorOptions, ParamListInfo boundParams) +{ + PlannedStmt *result; + + /* Invoke the planner, possibly via a previous hook user */ + if (prev_planner_hook) + result = prev_planner_hook(parse, query_string, cursorOptions, + boundParams); + else + result = standard_planner(parse, query_string, cursorOptions, + boundParams); + + /* If enabled, delay by taking and releasing the specified lock */ + if (post_planning_lock_id != 0) + { + DirectFunctionCall1(pg_advisory_lock_int8, + Int64GetDatum((int64) post_planning_lock_id)); + DirectFunctionCall1(pg_advisory_unlock_int8, + Int64GetDatum((int64) post_planning_lock_id)); + + /* + * Ensure that we notice any pending invalidations, since the advisory + * lock functions don't do this. 
+ */ + AcceptInvalidationMessages(); + } + + return result; +} + +/* Module load function */ +void +_PG_init(void) +{ + /* Set up the GUC to control which lock is used */ + DefineCustomIntVariable("delay_execution.post_planning_lock_id", + "Sets the advisory lock ID to be locked/unlocked after planning.", + "Zero disables the delay.", + &post_planning_lock_id, + 0, + 0, INT_MAX, + PGC_USERSET, + 0, + NULL, + NULL, + NULL); + + /* Install our hook */ + prev_planner_hook = planner_hook; + planner_hook = delay_execution_planner; +} + +/* Module unload function (pro forma, not used currently) */ +void +_PG_fini(void) +{ + planner_hook = prev_planner_hook; +} diff --git a/src/test/modules/delay_execution/expected/partition-addition.out b/src/test/modules/delay_execution/expected/partition-addition.out new file mode 100644 index 000000000000..7c91090eeff8 --- /dev/null +++ b/src/test/modules/delay_execution/expected/partition-addition.out @@ -0,0 +1,21 @@ +Parsed test spec with 2 sessions + +starting permutation: s2lock s1exec s2addp s2unlock +step s2lock: SELECT pg_advisory_lock(12345); +pg_advisory_lock + + +step s1exec: LOAD 'delay_execution'; + SET delay_execution.post_planning_lock_id = 12345; + SELECT * FROM foo WHERE a <> 1 AND a <> (SELECT 3); +step s2addp: CREATE TABLE foo2 (LIKE foo); + ALTER TABLE foo ATTACH PARTITION foo2 FOR VALUES IN (2); + INSERT INTO foo VALUES (2, 'ADD2'); +step s2unlock: SELECT pg_advisory_unlock(12345); +pg_advisory_unlock + +t +step s1exec: <... completed> +a b + +4 GHI diff --git a/src/test/modules/delay_execution/specs/partition-addition.spec b/src/test/modules/delay_execution/specs/partition-addition.spec new file mode 100644 index 000000000000..2a0948247e32 --- /dev/null +++ b/src/test/modules/delay_execution/specs/partition-addition.spec @@ -0,0 +1,38 @@ +# Test addition of a partition with less-than-exclusive locking. 
+ +setup +{ + CREATE TABLE foo (a int, b text) PARTITION BY LIST(a); + CREATE TABLE foo1 PARTITION OF foo FOR VALUES IN (1); + CREATE TABLE foo3 PARTITION OF foo FOR VALUES IN (3); + CREATE TABLE foo4 PARTITION OF foo FOR VALUES IN (4); + INSERT INTO foo VALUES (1, 'ABC'); + INSERT INTO foo VALUES (3, 'DEF'); + INSERT INTO foo VALUES (4, 'GHI'); +} + +teardown +{ + DROP TABLE foo; +} + +# The SELECT will be planned with just the three partitions shown above, +# of which we expect foo1 to be pruned at planning and foo3 at execution. +# Then we'll block, and by the time the query is actually executed, +# partition foo2 will also exist. We expect that not to be scanned. +# This test is specifically designed to check ExecCreatePartitionPruneState's +# code for matching up the partition lists in such cases. + +session "s1" +step "s1exec" { LOAD 'delay_execution'; + SET delay_execution.post_planning_lock_id = 12345; + SELECT * FROM foo WHERE a <> 1 AND a <> (SELECT 3); } + +session "s2" +step "s2lock" { SELECT pg_advisory_lock(12345); } +step "s2unlock" { SELECT pg_advisory_unlock(12345); } +step "s2addp" { CREATE TABLE foo2 (LIKE foo); + ALTER TABLE foo ATTACH PARTITION foo2 FOR VALUES IN (2); + INSERT INTO foo VALUES (2, 'ADD2'); } + +permutation "s2lock" "s1exec" "s2addp" "s2unlock" From cea3d55898655582e3a3835a7bed2c3a1b002fef Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Fri, 7 Aug 2020 17:24:40 -0400 Subject: [PATCH 266/334] Remove PROC_IN_ANALYZE and derived flags These flags are unused and always have been. 
Discussion: https://postgr.es/m/20200805235549.GA8118@alvherre.pgsql --- src/backend/commands/analyze.c | 13 +------------ src/include/storage/proc.h | 3 +-- src/include/storage/procarray.h | 7 ------- 3 files changed, 2 insertions(+), 21 deletions(-) diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index 924ef37c8163..e0fa73ba7909 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -247,11 +247,8 @@ analyze_rel(Oid relid, RangeVar *relation, } /* - * OK, let's do it. First let other backends know I'm in ANALYZE. + * OK, let's do it. First, initialize progress reporting. */ - LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - MyPgXact->vacuumFlags |= PROC_IN_ANALYZE; - LWLockRelease(ProcArrayLock); pgstat_progress_start_command(PROGRESS_COMMAND_ANALYZE, RelationGetRelid(onerel)); @@ -279,14 +276,6 @@ analyze_rel(Oid relid, RangeVar *relation, relation_close(onerel, NoLock); pgstat_progress_end_command(); - - /* - * Reset my PGXACT flag. Note: we need this here, and not in vacuum_rel, - * because the vacuum flag is cleared by the end-of-xact code. - */ - LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - MyPgXact->vacuumFlags &= ~PROC_IN_ANALYZE; - LWLockRelease(ProcArrayLock); } /* diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index b20e2ad4f6aa..5ceb2494bae7 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -52,7 +52,6 @@ struct XidCache */ #define PROC_IS_AUTOVACUUM 0x01 /* is it an autovac worker? 
*/ #define PROC_IN_VACUUM 0x02 /* currently running lazy vacuum */ -#define PROC_IN_ANALYZE 0x04 /* currently running analyze */ #define PROC_VACUUM_FOR_WRAPAROUND 0x08 /* set by autovac only */ #define PROC_IN_LOGICAL_DECODING 0x10 /* currently doing logical * decoding outside xact */ @@ -60,7 +59,7 @@ struct XidCache /* flags reset at EOXact */ #define PROC_VACUUM_STATE_MASK \ - (PROC_IN_VACUUM | PROC_IN_ANALYZE | PROC_VACUUM_FOR_WRAPAROUND) + (PROC_IN_VACUUM | PROC_VACUUM_FOR_WRAPAROUND) /* * We allow a small number of "weak" relation locks (AccessShareLock, diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index a5c7d0c0644a..01040d76e122 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -29,8 +29,6 @@ */ #define PROCARRAY_VACUUM_FLAG 0x02 /* currently running lazy * vacuum */ -#define PROCARRAY_ANALYZE_FLAG 0x04 /* currently running - * analyze */ #define PROCARRAY_LOGICAL_DECODING_FLAG 0x10 /* currently doing logical * decoding outside xact */ @@ -42,7 +40,6 @@ * have no corresponding PROC flag equivalent. 
*/ #define PROCARRAY_PROC_FLAGS_MASK (PROCARRAY_VACUUM_FLAG | \ - PROCARRAY_ANALYZE_FLAG | \ PROCARRAY_LOGICAL_DECODING_FLAG) /* Use the following flags as an input "flags" to GetOldestXmin function */ @@ -50,10 +47,6 @@ #define PROCARRAY_FLAGS_DEFAULT PROCARRAY_LOGICAL_DECODING_FLAG /* Ignore vacuum backends */ #define PROCARRAY_FLAGS_VACUUM PROCARRAY_FLAGS_DEFAULT | PROCARRAY_VACUUM_FLAG -/* Ignore analyze backends */ -#define PROCARRAY_FLAGS_ANALYZE PROCARRAY_FLAGS_DEFAULT | PROCARRAY_ANALYZE_FLAG -/* Ignore both vacuum and analyze backends */ -#define PROCARRAY_FLAGS_VACUUM_ANALYZE PROCARRAY_FLAGS_DEFAULT | PROCARRAY_VACUUM_FLAG | PROCARRAY_ANALYZE_FLAG extern Size ProcArrayShmemSize(void); extern void CreateSharedProcArray(void); From 0a7d771f0f63eb120e7f0a60aecd543ab25ba197 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Fri, 7 Aug 2020 15:27:56 -0700 Subject: [PATCH 267/334] Make nbtree split REDO locking match original execution. Make the nbtree page split REDO routine consistent with original execution in its approach to acquiring and releasing buffer locks (at least for pages on the tree level of the page being split). This brings btree_xlog_split() in line with btree_xlog_unlink_page(), which was taught to couple buffer locks by commit 9a9db08a. Note that the precise order in which we both acquire and release sibling buffer locks in btree_xlog_split() now matches original execution exactly (the precise order in which the locks are released probably doesn't matter much, but we might as well be consistent about it). The rule for nbtree REDO routines from here on is that same-level locks should be acquired in an order that's consistent with original execution. It's not practical to have a similar rule for cross-level page locks, since for the most part original execution holds those locks for a period that spans multiple atomic actions/WAL records. 
It's also not necessary, because clearly the cross-level lock coupling is only truly needed during original execution because of the presence of concurrent inserters. This is not a bug fix (unlike the similar aforementioned commit, commit 9a9db08a). The immediate reason to tighten things up in this area is to enable an upcoming enhancement to contrib/amcheck that allows it to verify that sibling links are in agreement with only an AccessShareLock (this check produced false positives when run on a replica server on account of the inconsistency fixed by this commit). But that's not the only reason to be stricter here. It is generally useful to make locking on replicas be as close to what happens during original execution as practically possible. It makes it less likely that hard to catch bugs will slip in in the future. The previous state of affairs seems to be a holdover from before the introduction of Hot Standby, when buffer lock acquisitions during recovery were totally unnecessary. See also: commit 3bbf668d, which tightened things up in this area a few years after the introduction of Hot Standby. Discussion: https://postgr.es/m/CAH2-Wz=465cJj11YXD9RKH8z=nhQa2dofOZ_23h67EXUGOJ00Q@mail.gmail.com --- src/backend/access/nbtree/README | 23 +++--------- src/backend/access/nbtree/nbtxlog.c | 58 ++++++++++++++--------------- 2 files changed, 35 insertions(+), 46 deletions(-) diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 9d5fc424a574..abce31a5a96b 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -572,23 +572,12 @@ replay of page deletion records does not hold a write lock on the target leaf page throughout; only the primary needs to block out concurrent writers that insert on to the page being deleted.) -There are also locking differences between the primary and WAL replay -for the first stage of a page split (i.e. same-level differences in -locking). 
Replay of the first phase of a page split can get away with -locking and updating the original right sibling page (which is also the -new right sibling page's right sibling) after locks on the original page -and its new right sibling have been released. Again, this is okay -because there are no writers. Page deletion WAL replay cannot get away -with being lax about same-level locking during replay, though -- doing -so risks confusing concurrent backwards scans. - -Page deletion's second phase locks the left sibling page, target page, -and right page in order on the standby, just like on the primary. This -allows backwards scans running on a standby to reason about page -deletion on the leaf level; a page cannot appear deleted without that -being reflected in the sibling pages. It's probably possible to be more -lax about how locks are acquired on the standby during the second phase -of page deletion, but that hardly seems worth it. +WAL replay holds same-level locks in a way that matches the approach +taken during original execution, though. This prevent readers from +observing same-level inconsistencies. It's probably possible to be more +lax about how same-level locks are acquired during recovery (most kinds +of readers could still move right to recover if we didn't couple +same-level locks), but we prefer to be conservative here. During recovery all index scans start with ignore_killed_tuples = false and we never set kill_prior_tuple. We do this because the oldest xmin diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 1fd639246328..dbec58d5249c 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -172,10 +172,10 @@ btree_xlog_insert(bool isleaf, bool ismeta, bool posting, * Insertion to an internal page finishes an incomplete split at the child * level. Clear the incomplete-split flag in the child. 
Note: during * normal operation, the child and parent pages are locked at the same - * time, so that clearing the flag and inserting the downlink appear - * atomic to other backends. We don't bother with that during replay, - * because readers don't care about the incomplete-split flag and there - * cannot be updates happening. + * time (the locks are coupled), so that clearing the flag and inserting + * the downlink appear atomic to other backends. We don't bother with + * that during replay, because readers don't care about the + * incomplete-split flag and there cannot be updates happening. */ if (!isleaf) _bt_clear_incomplete_split(record, 1); @@ -272,9 +272,17 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) spagenumber = P_NONE; /* - * Clear the incomplete split flag on the left sibling of the child page - * this is a downlink for. (Like in btree_xlog_insert, this can be done - * before locking the other pages) + * Clear the incomplete split flag on the appropriate child page one level + * down when origpage/buf is an internal page (there must have been + * cascading page splits during original execution in the event of an + * internal page split). This is like the corresponding btree_xlog_insert + * call for internal pages. We're not clearing the incomplete split flag + * for the current page split here (you can think of this as part of the + * insert of newitem that the page split action needs to perform in + * passing). + * + * Like in btree_xlog_insert, this can be done before locking other pages. + * We never need to couple cross-level locks in REDO routines. */ if (!isleaf) _bt_clear_incomplete_split(record, 3); @@ -427,22 +435,7 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) MarkBufferDirty(buf); } - /* - * We no longer need the buffers. They must be released together, so that - * readers cannot observe two inconsistent halves. 
- */ - if (BufferIsValid(buf)) - UnlockReleaseBuffer(buf); - UnlockReleaseBuffer(rbuf); - - /* - * Fix left-link of the page to the right of the new right sibling. - * - * Note: in normal operation, we do this while still holding lock on the - * two split pages. However, that's not necessary for correctness in WAL - * replay, because no other index update can be in progress, and readers - * will cope properly when following an obsolete left-link. - */ + /* Fix left-link of the page to the right of the new right sibling */ if (spagenumber != P_NONE) { Buffer sbuf; @@ -460,6 +453,14 @@ btree_xlog_split(bool newitemonleft, XLogReaderState *record) if (BufferIsValid(sbuf)) UnlockReleaseBuffer(sbuf); } + + /* + * Finally, release the remaining buffers. sbuf, rbuf, and buf must be + * released together, so that readers cannot observe inconsistencies. + */ + UnlockReleaseBuffer(rbuf); + if (BufferIsValid(buf)) + UnlockReleaseBuffer(buf); } static void @@ -733,6 +734,11 @@ btree_xlog_mark_page_halfdead(uint8 info, XLogReaderState *record) PageSetLSN(page, lsn); MarkBufferDirty(buffer); } + + /* + * Don't need to couple cross-level locks in REDO routines, so release + * lock on internal page immediately + */ if (BufferIsValid(buffer)) UnlockReleaseBuffer(buffer); @@ -789,12 +795,6 @@ btree_xlog_unlink_page(uint8 info, XLogReaderState *record) * the pages in the same standard left-to-right order (leftsib, target, * rightsib), and don't release the sibling locks until the target is * marked deleted. - * - * btree_xlog_split() can get away with fixing its right sibling page's - * left link last of all, after dropping all other locks. We prefer to - * avoid dropping locks on same-level pages early compared to normal - * operation. This keeps things simple for backwards scans. See - * nbtree/README. 
*/ /* Fix right-link of left sibling, if any */ From 7259736a6e5b7c7588fff9578370736a6648acbb Mon Sep 17 00:00:00 2001 From: Amit Kapila Date: Sat, 8 Aug 2020 07:34:39 +0530 Subject: [PATCH 268/334] Implement streaming mode in ReorderBuffer. Instead of serializing the transaction to disk after reaching the logical_decoding_work_mem limit in memory, we consume the changes we have in memory and invoke stream API methods added by commit 45fdc9738b. However, sometimes if we have incomplete toast or speculative insert we spill to the disk because we can't generate the complete tuple and stream. And, as soon as we get the complete tuple we stream the transaction including the serialized changes. We can do this incremental processing thanks to having assignments (associating subxact with toplevel xacts) in WAL right away, and thanks to logging the invalidation messages at each command end. These features are added by commits 0bead9af48 and c55040ccd0 respectively. Now that we can stream in-progress transactions, the concurrent aborts may cause failures when the output plugin consults catalogs (both system and user-defined). We handle such failures by returning ERRCODE_TRANSACTION_ROLLBACK sqlerrcode from system table scan APIs to the backend or WALSender decoding a specific uncommitted transaction. The decoding logic on the receipt of such a sqlerrcode aborts the decoding of the current transaction and continue with the decoding of other transactions. We have ReorderBufferTXN pointer in each ReorderBufferChange by which we know which xact it belongs to. The output plugin can use this to decide which changes to discard in case of stream_abort_cb (e.g. when a subxact gets discarded). We also provide a new option via SQL APIs to fetch the changes being streamed. 
Author: Dilip Kumar, Tomas Vondra, Amit Kapila, Nikhil Sontakke Reviewed-by: Amit Kapila, Kuntal Ghosh, Ajin Cherian Tested-by: Neha Sharma, Mahendra Singh Thalor and Ajin Cherian Discussion: https://postgr.es/m/688b0b7f-2f6c-d827-c27b-216a8e3ea700@2ndquadrant.com --- contrib/test_decoding/Makefile | 2 +- contrib/test_decoding/expected/stream.out | 94 ++ contrib/test_decoding/expected/truncate.out | 6 + contrib/test_decoding/sql/stream.sql | 30 + contrib/test_decoding/sql/truncate.sql | 1 + contrib/test_decoding/test_decoding.c | 13 + doc/src/sgml/logicaldecoding.sgml | 9 +- doc/src/sgml/test-decoding.sgml | 22 + src/backend/access/heap/heapam.c | 13 + src/backend/access/heap/heapam_visibility.c | 42 +- src/backend/access/index/genam.c | 53 + src/backend/access/table/tableam.c | 8 + src/backend/access/transam/xact.c | 19 + src/backend/replication/logical/decode.c | 17 +- src/backend/replication/logical/logical.c | 10 + .../replication/logical/reorderbuffer.c | 981 ++++++++++++++++-- src/include/access/heapam_xlog.h | 1 + src/include/access/tableam.h | 55 + src/include/access/xact.h | 4 + src/include/replication/logical.h | 1 + src/include/replication/reorderbuffer.h | 56 +- 21 files changed, 1331 insertions(+), 106 deletions(-) create mode 100644 contrib/test_decoding/expected/stream.out create mode 100644 contrib/test_decoding/sql/stream.sql diff --git a/contrib/test_decoding/Makefile b/contrib/test_decoding/Makefile index f439c582a5f9..ed9a3d6c0ede 100644 --- a/contrib/test_decoding/Makefile +++ b/contrib/test_decoding/Makefile @@ -5,7 +5,7 @@ PGFILEDESC = "test_decoding - example of a logical decoding output plugin" REGRESS = ddl xact rewrite toast permissions decoding_in_xact \ decoding_into_rel binary prepared replorigin time messages \ - spill slot truncate + spill slot truncate stream ISOLATION = mxact delayed_startup ondisk_startup concurrent_ddl_dml \ oldest_xmin snapshot_transfer subxact_without_top diff --git a/contrib/test_decoding/expected/stream.out 
b/contrib/test_decoding/expected/stream.out new file mode 100644 index 000000000000..9a5d7e7c4399 --- /dev/null +++ b/contrib/test_decoding/expected/stream.out @@ -0,0 +1,94 @@ +SET synchronous_commit = on; +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); + ?column? +---------- + init +(1 row) + +CREATE TABLE stream_test(data text); +-- consume DDL +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1'); + data +------ +(0 rows) + +-- streaming test with sub-transaction +BEGIN; +savepoint s1; +SELECT 'msg5' FROM pg_logical_emit_message(true, 'test', repeat('a', 50)); + ?column? +---------- + msg5 +(1 row) + +INSERT INTO stream_test SELECT repeat('a', 2000) || g.i FROM generate_series(1, 35) g(i); +TRUNCATE table stream_test; +rollback to s1; +INSERT INTO stream_test SELECT repeat('a', 10) || g.i FROM generate_series(1, 20) g(i); +COMMIT; +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL,NULL, 'include-xids', '0', 'stream-changes', '1'); + data +---------------------------------------------------------- + opening a streamed block for transaction + streaming message: transactional: 1 prefix: test, sz: 50 + closing a streamed block for transaction + aborting streamed (sub)transaction + opening a streamed block for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for 
transaction + streaming change for transaction + streaming change for transaction + closing a streamed block for transaction + committing streamed transaction +(27 rows) + +-- streaming test for toast changes +ALTER TABLE stream_test ALTER COLUMN data set storage external; +-- consume DDL +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1'); + data +------ +(0 rows) + +INSERT INTO stream_test SELECT repeat('a', 6000) || g.i FROM generate_series(1, 10) g(i); +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL,NULL, 'include-xids', '0', 'stream-changes', '1'); + data +------------------------------------------ + opening a streamed block for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + streaming change for transaction + closing a streamed block for transaction + committing streamed transaction +(13 rows) + +DROP TABLE stream_test; +SELECT pg_drop_replication_slot('regression_slot'); + pg_drop_replication_slot +-------------------------- + +(1 row) + diff --git a/contrib/test_decoding/expected/truncate.out b/contrib/test_decoding/expected/truncate.out index 1cf2ae835c84..e64d377214ab 100644 --- a/contrib/test_decoding/expected/truncate.out +++ b/contrib/test_decoding/expected/truncate.out @@ -25,3 +25,9 @@ SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'inc COMMIT (9 rows) +SELECT pg_drop_replication_slot('regression_slot'); + pg_drop_replication_slot +-------------------------- + +(1 row) + diff --git a/contrib/test_decoding/sql/stream.sql b/contrib/test_decoding/sql/stream.sql new file mode 100644 index 000000000000..8abc30de0afc --- /dev/null +++ 
b/contrib/test_decoding/sql/stream.sql @@ -0,0 +1,30 @@ +SET synchronous_commit = on; +SELECT 'init' FROM pg_create_logical_replication_slot('regression_slot', 'test_decoding'); + +CREATE TABLE stream_test(data text); + +-- consume DDL +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1'); + +-- streaming test with sub-transaction +BEGIN; +savepoint s1; +SELECT 'msg5' FROM pg_logical_emit_message(true, 'test', repeat('a', 50)); +INSERT INTO stream_test SELECT repeat('a', 2000) || g.i FROM generate_series(1, 35) g(i); +TRUNCATE table stream_test; +rollback to s1; +INSERT INTO stream_test SELECT repeat('a', 10) || g.i FROM generate_series(1, 20) g(i); +COMMIT; + +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL,NULL, 'include-xids', '0', 'stream-changes', '1'); + +-- streaming test for toast changes +ALTER TABLE stream_test ALTER COLUMN data set storage external; +-- consume DDL +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1'); + +INSERT INTO stream_test SELECT repeat('a', 6000) || g.i FROM generate_series(1, 10) g(i); +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL,NULL, 'include-xids', '0', 'stream-changes', '1'); + +DROP TABLE stream_test; +SELECT pg_drop_replication_slot('regression_slot'); diff --git a/contrib/test_decoding/sql/truncate.sql b/contrib/test_decoding/sql/truncate.sql index 5aecdf0881f5..5633854e0dfc 100644 --- a/contrib/test_decoding/sql/truncate.sql +++ b/contrib/test_decoding/sql/truncate.sql @@ -11,3 +11,4 @@ TRUNCATE tab1, tab1 RESTART IDENTITY CASCADE; TRUNCATE tab1, tab2; SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1'); +SELECT pg_drop_replication_slot('regression_slot'); diff --git a/contrib/test_decoding/test_decoding.c b/contrib/test_decoding/test_decoding.c index 
dbef52a3af47..34745150e9ba 100644 --- a/contrib/test_decoding/test_decoding.c +++ b/contrib/test_decoding/test_decoding.c @@ -122,6 +122,7 @@ pg_decode_startup(LogicalDecodingContext *ctx, OutputPluginOptions *opt, { ListCell *option; TestDecodingData *data; + bool enable_streaming = false; data = palloc0(sizeof(TestDecodingData)); data->context = AllocSetContextCreate(ctx->context, @@ -212,6 +213,16 @@ pg_decode_startup(LogicalDecodingContext *ctx, OutputPluginOptions *opt, errmsg("could not parse value \"%s\" for parameter \"%s\"", strVal(elem->arg), elem->defname))); } + else if (strcmp(elem->defname, "stream-changes") == 0) + { + if (elem->arg == NULL) + continue; + else if (!parse_bool(strVal(elem->arg), &enable_streaming)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("could not parse value \"%s\" for parameter \"%s\"", + strVal(elem->arg), elem->defname))); + } else { ereport(ERROR, @@ -221,6 +232,8 @@ pg_decode_startup(LogicalDecodingContext *ctx, OutputPluginOptions *opt, elem->arg ? strVal(elem->arg) : "(null)"))); } } + + ctx->streaming &= enable_streaming; } /* cleanup this plugin's resources */ diff --git a/doc/src/sgml/logicaldecoding.sgml b/doc/src/sgml/logicaldecoding.sgml index 791a62b57c9b..1571d71a5b6c 100644 --- a/doc/src/sgml/logicaldecoding.sgml +++ b/doc/src/sgml/logicaldecoding.sgml @@ -433,9 +433,12 @@ typedef void (*LogicalOutputPluginInit) (struct OutputPluginCallbacks *cb); ALTER TABLE user_catalog_table SET (user_catalog_table = true); CREATE TABLE another_catalog_table(data text) WITH (user_catalog_table = true); - Any actions leading to transaction ID assignment are prohibited. That, among others, - includes writing to tables, performing DDL changes, and - calling pg_current_xact_id(). + Note that access to user catalog tables or regular system catalog tables + in the output plugins has to be done via the systable_* + scan APIs only. Access via the heap_* scan APIs will + error out. 
Additionally, any actions leading to transaction ID assignment + are prohibited. That, among others, includes writing to tables, performing + DDL changes, and calling pg_current_xact_id(). diff --git a/doc/src/sgml/test-decoding.sgml b/doc/src/sgml/test-decoding.sgml index 8356a3d67b31..fe7c9783facd 100644 --- a/doc/src/sgml/test-decoding.sgml +++ b/doc/src/sgml/test-decoding.sgml @@ -39,4 +39,26 @@ postgres=# SELECT * FROM pg_logical_slot_get_changes('test_slot', NULL, NULL, 'i + + We can also get the changes of the in-progress transaction and the typical + output, might be: + + +postgres[33712]=#* SELECT * FROM pg_logical_slot_get_changes('test_slot', NULL, NULL, 'stream-changes', '1'); + lsn | xid | data +-----------+-----+-------------------------------------------------- + 0/16B21F8 | 503 | opening a streamed block for transaction TXN 503 + 0/16B21F8 | 503 | streaming change for TXN 503 + 0/16B2300 | 503 | streaming change for TXN 503 + 0/16B2408 | 503 | streaming change for TXN 503 + 0/16BEBA0 | 503 | closing a streamed block for transaction TXN 503 + 0/16B21F8 | 503 | opening a streamed block for transaction TXN 503 + 0/16BECA8 | 503 | streaming change for TXN 503 + 0/16BEDB0 | 503 | streaming change for TXN 503 + 0/16BEEB8 | 503 | streaming change for TXN 503 + 0/16BEBA0 | 503 | closing a streamed block for transaction TXN 503 +(10 rows) + + + diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 5eef225f5c79..00169006fb1f 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -1299,6 +1299,16 @@ heap_getnext(TableScanDesc sscan, ScanDirection direction) (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg_internal("only heap AM is supported"))); + /* + * We don't expect direct calls to heap_getnext with valid CheckXidAlive + * for catalog or regular tables. See detailed comments in xact.c where + * these variables are declared. 
Normally we have such a check at tableam + * level API but this is called from many places so we need to ensure it + * here. + */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "unexpected heap_getnext call during logical decoding"); + /* Note: no locking manipulations needed */ if (scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE) @@ -1956,6 +1966,9 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, { xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE; bufflags |= REGBUF_KEEP_DATA; + + if (IsToastRelation(relation)) + xlrec.flags |= XLH_INSERT_ON_TOAST_RELATION; } XLogBeginInsert(); diff --git a/src/backend/access/heap/heapam_visibility.c b/src/backend/access/heap/heapam_visibility.c index dba10890aabe..c77128087cf7 100644 --- a/src/backend/access/heap/heapam_visibility.c +++ b/src/backend/access/heap/heapam_visibility.c @@ -1571,8 +1571,25 @@ HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot, htup, buffer, &cmin, &cmax); + /* + * If we haven't resolved the combocid to cmin/cmax, that means we + * have not decoded the combocid yet. That means the cmin is + * definitely in the future, and we're not supposed to see the tuple + * yet. + * + * XXX This only applies to decoding of in-progress transactions. In + * regular logical decoding we only execute this code at commit time, + * at which point we should have seen all relevant combocids. So + * ideally, we should error out in this case but in practice, this + * won't happen. If we are too worried about this then we can add an + * elog inside ResolveCminCmaxDuringDecoding. + * + * XXX For the streaming case, we can track the largest combocid + * assigned, and error out based on this (when unable to resolve + * combocid below that observed maximum value). 
+ */ if (!resolved) - elog(ERROR, "could not resolve cmin/cmax of catalog tuple"); + return false; Assert(cmin != InvalidCommandId); @@ -1642,10 +1659,25 @@ HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot, htup, buffer, &cmin, &cmax); - if (!resolved) - elog(ERROR, "could not resolve combocid to cmax"); - - Assert(cmax != InvalidCommandId); + /* + * If we haven't resolved the combocid to cmin/cmax, that means we + * have not decoded the combocid yet. That means the cmax is + * definitely in the future, and we're still supposed to see the + * tuple. + * + * XXX This only applies to decoding of in-progress transactions. In + * regular logical decoding we only execute this code at commit time, + * at which point we should have seen all relevant combocids. So + * ideally, we should error out in this case but in practice, this + * won't happen. If we are too worried about this then we can add an + * elog inside ResolveCminCmaxDuringDecoding. + * + * XXX For the streaming case, we can track the largest combocid + * assigned, and error out based on this (when unable to resolve + * combocid below that observed maximum value). + */ + if (!resolved || cmax == InvalidCommandId) + return true; if (cmax >= snapshot->curcid) return true; /* deleted after scan started */ diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index dfba5ae39ae9..e3164e674a7b 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -28,6 +28,7 @@ #include "lib/stringinfo.h" #include "miscadmin.h" #include "storage/bufmgr.h" +#include "storage/procarray.h" #include "utils/acl.h" #include "utils/builtins.h" #include "utils/lsyscache.h" @@ -429,9 +430,36 @@ systable_beginscan(Relation heapRelation, sysscan->iscan = NULL; } + /* + * If CheckXidAlive is set then set a flag to indicate that system table + * scan is in-progress. See detailed comments in xact.c where these + * variables are declared. 
+ */ + if (TransactionIdIsValid(CheckXidAlive)) + bsysscan = true; + return sysscan; } +/* + * HandleConcurrentAbort - Handle concurrent abort of the CheckXidAlive. + * + * Error out, if CheckXidAlive is aborted. We can't directly use + * TransactionIdDidAbort as after crash such transaction might not have been + * marked as aborted. See detailed comments in xact.c where the variable + * is declared. + */ +static inline void +HandleConcurrentAbort() +{ + if (TransactionIdIsValid(CheckXidAlive) && + !TransactionIdIsInProgress(CheckXidAlive) && + !TransactionIdDidCommit(CheckXidAlive)) + ereport(ERROR, + (errcode(ERRCODE_TRANSACTION_ROLLBACK), + errmsg("transaction aborted during system catalog scan"))); +} + /* * systable_getnext --- get next tuple in a heap-or-index scan * @@ -481,6 +509,12 @@ systable_getnext(SysScanDesc sysscan) } } + /* + * Handle the concurrent abort while fetching the catalog tuple during + * logical streaming of a transaction. + */ + HandleConcurrentAbort(); + return htup; } @@ -517,6 +551,12 @@ systable_recheck_tuple(SysScanDesc sysscan, HeapTuple tup) sysscan->slot, freshsnap); + /* + * Handle the concurrent abort while fetching the catalog tuple during + * logical streaming of a transaction. + */ + HandleConcurrentAbort(); + return result; } @@ -545,6 +585,13 @@ systable_endscan(SysScanDesc sysscan) if (sysscan->snapshot) UnregisterSnapshot(sysscan->snapshot); + /* + * Reset the bsysscan flag at the end of the systable scan. See + * detailed comments in xact.c where these variables are declared. + */ + if (TransactionIdIsValid(CheckXidAlive)) + bsysscan = false; + pfree(sysscan); } @@ -643,6 +690,12 @@ systable_getnext_ordered(SysScanDesc sysscan, ScanDirection direction) if (htup && sysscan->iscan->xs_recheck) elog(ERROR, "system catalog scans with lossy index conditions are not implemented"); + /* + * Handle the concurrent abort while fetching the catalog tuple during + * logical streaming of a transaction. 
+ */ + HandleConcurrentAbort(); + return htup; } diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index 3afb63b1fe4d..c63831976575 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -248,6 +248,14 @@ table_tuple_get_latest_tid(TableScanDesc scan, ItemPointer tid) Relation rel = scan->rs_rd; const TableAmRoutine *tableam = rel->rd_tableam; + /* + * We don't expect direct calls to table_tuple_get_latest_tid with valid + * CheckXidAlive for catalog or regular tables. See detailed comments in + * xact.c where these variables are declared. + */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "unexpected table_tuple_get_latest_tid call during logical decoding"); + /* * Since this can be called with user-supplied TID, don't trust the input * too much. diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index d4f7c29847f4..727d61603593 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -82,6 +82,19 @@ bool XactDeferrable; int synchronous_commit = SYNCHRONOUS_COMMIT_ON; +/* + * CheckXidAlive is a xid value pointing to a possibly ongoing (sub) + * transaction. Currently, it is used in logical decoding. It's possible + * that such transactions can get aborted while the decoding is ongoing in + * which case we skip decoding that particular transaction. To ensure that we + * check whether the CheckXidAlive is aborted after fetching the tuple from + * system tables. We also ensure that during logical decoding we never + * directly access the tableam or heap APIs because we are checking for the + * concurrent aborts only in systable_* APIs. 
+ */ +TransactionId CheckXidAlive = InvalidTransactionId; +bool bsysscan = false; + /* * When running as a parallel worker, we place only a single * TransactionStateData on the parallel worker's state stack, and the XID @@ -2680,6 +2693,9 @@ AbortTransaction(void) /* Forget about any active REINDEX. */ ResetReindexState(s->nestingLevel); + /* Reset logical streaming state. */ + ResetLogicalStreamingState(); + /* If in parallel mode, clean up workers and exit parallel mode. */ if (IsInParallelMode()) { @@ -4982,6 +4998,9 @@ AbortSubTransaction(void) /* Forget about any active REINDEX. */ ResetReindexState(s->nestingLevel); + /* Reset logical streaming state. */ + ResetLogicalStreamingState(); + /* Exit from parallel mode, if necessary. */ if (IsInParallelMode()) { diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index f3a1c31a2921..f21f61d5e10b 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -724,7 +724,9 @@ DecodeInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) change->data.tp.clear_toast_afterwards = true; - ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, change); + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, + xlrec->flags & XLH_INSERT_ON_TOAST_RELATION); } /* @@ -791,7 +793,8 @@ DecodeUpdate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) change->data.tp.clear_toast_afterwards = true; - ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, change); + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, false); } /* @@ -848,7 +851,8 @@ DecodeDelete(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) change->data.tp.clear_toast_afterwards = true; - ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, change); + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, false); } /* @@ -884,7 +888,7 @@ 
DecodeTruncate(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) memcpy(change->data.truncate.relids, xlrec->relids, xlrec->nrelids * sizeof(Oid)); ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), - buf->origptr, change); + buf->origptr, change, false); } /* @@ -984,7 +988,7 @@ DecodeMultiInsert(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) change->data.tp.clear_toast_afterwards = false; ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), - buf->origptr, change); + buf->origptr, change, false); /* move to the next xl_multi_insert_tuple entry */ data += datalen; @@ -1022,7 +1026,8 @@ DecodeSpecConfirm(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) change->data.tp.clear_toast_afterwards = true; - ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, change); + ReorderBufferQueueChange(ctx->reorder, XLogRecGetXid(r), buf->origptr, + change, false); } diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c index 05d24b93da02..42f284b33f6b 100644 --- a/src/backend/replication/logical/logical.c +++ b/src/backend/replication/logical/logical.c @@ -1442,3 +1442,13 @@ LogicalConfirmReceivedLocation(XLogRecPtr lsn) SpinLockRelease(&MyReplicationSlot->mutex); } } + +/* + * Clear logical streaming state during (sub)transaction abort. 
+ */ +void +ResetLogicalStreamingState(void) +{ + CheckXidAlive = InvalidTransactionId; + bsysscan = false; +} diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index ce6e62152f03..5b7afe6d9e9c 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -178,6 +178,21 @@ typedef struct ReorderBufferDiskChange /* data follows */ } ReorderBufferDiskChange; +#define IsSpecInsert(action) \ +( \ + ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT) \ +) +#define IsSpecConfirm(action) \ +( \ + ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM) \ +) +#define IsInsertOrUpdate(action) \ +( \ + (((action) == REORDER_BUFFER_CHANGE_INSERT) || \ + ((action) == REORDER_BUFFER_CHANGE_UPDATE) || \ + ((action) == REORDER_BUFFER_CHANGE_INTERNAL_SPEC_INSERT)) \ +) + /* * Maximum number of changes kept in memory, per transaction. After that, * changes are spooled to disk. @@ -236,6 +251,7 @@ static Size ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn static void ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn, char *change); static void ReorderBufferRestoreCleanup(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn); static void ReorderBufferCleanupSerializedTXNs(const char *slotname); static void ReorderBufferSerializedPath(char *path, ReplicationSlot *slot, TransactionId xid, XLogSegNo segno); @@ -244,6 +260,16 @@ static void ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap); static Snapshot ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap, ReorderBufferTXN *txn, CommandId cid); +/* + * --------------------------------------- + * Streaming support functions + * --------------------------------------- + */ +static inline bool ReorderBufferCanStream(ReorderBuffer *rb); +static inline bool 
ReorderBufferCanStartStreaming(ReorderBuffer *rb); +static void ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn); +static void ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn); + /* --------------------------------------- * toast reassembly support * --------------------------------------- @@ -367,6 +393,9 @@ ReorderBufferGetTXN(ReorderBuffer *rb) dlist_init(&txn->tuplecids); dlist_init(&txn->subtxns); + /* InvalidCommandId is not zero, so set it explicitly */ + txn->command_id = InvalidCommandId; + return txn; } @@ -416,13 +445,15 @@ ReorderBufferGetChange(ReorderBuffer *rb) } /* - * Free an ReorderBufferChange. + * Free a ReorderBufferChange and update memory accounting, if requested. */ void -ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change) +ReorderBufferReturnChange(ReorderBuffer *rb, ReorderBufferChange *change, + bool upd_mem) { /* update memory accounting info */ - ReorderBufferChangeMemoryUpdate(rb, change, false); + if (upd_mem) + ReorderBufferChangeMemoryUpdate(rb, change, false); /* free contained data */ switch (change->action) @@ -624,16 +655,102 @@ ReorderBufferTXNByXid(ReorderBuffer *rb, TransactionId xid, bool create, } /* - * Queue a change into a transaction so it can be replayed upon commit. + * Record the partial change for the streaming of in-progress transactions. We + * can stream only complete changes so if we have a partial change like toast + * table insert or speculative insert then we mark such a 'txn' so that it + * can't be streamed. We also ensure that if the changes in such a 'txn' are + * above logical_decoding_work_mem threshold then we stream them as soon as we + * have a complete change. + */ +static void +ReorderBufferProcessPartialChange(ReorderBuffer *rb, ReorderBufferTXN *txn, + ReorderBufferChange *change, + bool toast_insert) +{ + ReorderBufferTXN *toptxn; + + /* + * The partial changes need to be processed only while streaming + * in-progress transactions. 
+ */ + if (!ReorderBufferCanStream(rb)) + return; + + /* Get the top transaction. */ + if (txn->toptxn != NULL) + toptxn = txn->toptxn; + else + toptxn = txn; + + /* + * Set the toast insert bit whenever we get toast insert to indicate a + * partial change and clear it when we get the insert or update on main + * table (Both update and insert will do the insert in the toast table). + */ + if (toast_insert) + toptxn->txn_flags |= RBTXN_HAS_TOAST_INSERT; + else if (rbtxn_has_toast_insert(toptxn) && + IsInsertOrUpdate(change->action)) + toptxn->txn_flags &= ~RBTXN_HAS_TOAST_INSERT; + + /* + * Set the spec insert bit whenever we get the speculative insert to + * indicate the partial change and clear the same on speculative confirm. + */ + if (IsSpecInsert(change->action)) + toptxn->txn_flags |= RBTXN_HAS_SPEC_INSERT; + else if (IsSpecConfirm(change->action)) + { + /* + * Speculative confirm change must be preceded by speculative + * insertion. + */ + Assert(rbtxn_has_spec_insert(toptxn)); + toptxn->txn_flags &= ~RBTXN_HAS_SPEC_INSERT; + } + + /* + * Stream the transaction if it is serialized before and the changes are + * now complete in the top-level transaction. + * + * The reason for doing the streaming of such a transaction as soon as we + * get the complete change for it is that previously it would have reached + * the memory threshold and wouldn't get streamed because of incomplete + * changes. Delaying such transactions would increase apply lag for them. + */ + if (ReorderBufferCanStartStreaming(rb) && + !(rbtxn_has_incomplete_tuple(toptxn)) && + rbtxn_is_serialized(txn)) + ReorderBufferStreamTXN(rb, toptxn); +} + +/* + * Queue a change into a transaction so it can be replayed upon commit or will be + * streamed when we reach logical_decoding_work_mem threshold. 
*/ void ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, - ReorderBufferChange *change) + ReorderBufferChange *change, bool toast_insert) { ReorderBufferTXN *txn; txn = ReorderBufferTXNByXid(rb, xid, true, NULL, lsn, true); + /* + * While streaming the previous changes we have detected that the + * transaction is aborted. So there is no point in collecting further + * changes for it. + */ + if (txn->concurrent_abort) + { + /* + * We don't need to update memory accounting for this change as we + * have not added it to the queue yet. + */ + ReorderBufferReturnChange(rb, change, false); + return; + } + change->lsn = lsn; change->txn = txn; @@ -645,6 +762,9 @@ ReorderBufferQueueChange(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn, /* update memory accounting information */ ReorderBufferChangeMemoryUpdate(rb, change, true); + /* process partial change */ + ReorderBufferProcessPartialChange(rb, txn, change, toast_insert); + /* check the memory limits and evict something if needed */ ReorderBufferCheckMemoryLimit(rb); } @@ -674,7 +794,7 @@ ReorderBufferQueueMessage(ReorderBuffer *rb, TransactionId xid, change->data.msg.message = palloc(message_size); memcpy(change->data.msg.message, message, message_size); - ReorderBufferQueueChange(rb, xid, lsn, change); + ReorderBufferQueueChange(rb, xid, lsn, change, false); MemoryContextSwitchTo(oldcontext); } @@ -763,6 +883,38 @@ AssertTXNLsnOrder(ReorderBuffer *rb) #endif } +/* + * AssertChangeLsnOrder + * + * Check ordering of changes in the (sub)transaction. 
+ */ +static void +AssertChangeLsnOrder(ReorderBufferTXN *txn) +{ +#ifdef USE_ASSERT_CHECKING + dlist_iter iter; + XLogRecPtr prev_lsn = txn->first_lsn; + + dlist_foreach(iter, &txn->changes) + { + ReorderBufferChange *cur_change; + + cur_change = dlist_container(ReorderBufferChange, node, iter.cur); + + Assert(txn->first_lsn != InvalidXLogRecPtr); + Assert(cur_change->lsn != InvalidXLogRecPtr); + Assert(txn->first_lsn <= cur_change->lsn); + + if (txn->end_lsn != InvalidXLogRecPtr) + Assert(cur_change->lsn <= txn->end_lsn); + + Assert(prev_lsn <= cur_change->lsn); + + prev_lsn = cur_change->lsn; + } +#endif +} + /* * ReorderBufferGetOldestTXN * Return oldest transaction in reorderbuffer @@ -1018,6 +1170,9 @@ ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn, *iter_state = NULL; + /* Check ordering of changes in the toplevel transaction. */ + AssertChangeLsnOrder(txn); + /* * Calculate the size of our heap: one element for every transaction that * contains changes. (Besides the transactions already in the reorder @@ -1032,6 +1187,9 @@ ReorderBufferIterTXNInit(ReorderBuffer *rb, ReorderBufferTXN *txn, cur_txn = dlist_container(ReorderBufferTXN, node, cur_txn_i.cur); + /* Check ordering of changes in this subtransaction. 
*/ + AssertChangeLsnOrder(cur_txn); + if (cur_txn->nentries > 0) nr_txns++; } @@ -1148,7 +1306,7 @@ ReorderBufferIterTXNNext(ReorderBuffer *rb, ReorderBufferIterTXNState *state) { change = dlist_container(ReorderBufferChange, node, dlist_pop_head_node(&state->old_change)); - ReorderBufferReturnChange(rb, change); + ReorderBufferReturnChange(rb, change, true); Assert(dlist_is_empty(&state->old_change)); } @@ -1234,7 +1392,7 @@ ReorderBufferIterTXNFinish(ReorderBuffer *rb, change = dlist_container(ReorderBufferChange, node, dlist_pop_head_node(&state->old_change)); - ReorderBufferReturnChange(rb, change); + ReorderBufferReturnChange(rb, change, true); Assert(dlist_is_empty(&state->old_change)); } @@ -1280,7 +1438,7 @@ ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) /* Check we're not mixing changes from different transactions. */ Assert(change->txn == txn); - ReorderBufferReturnChange(rb, change); + ReorderBufferReturnChange(rb, change, true); } /* @@ -1297,7 +1455,7 @@ ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) Assert(change->txn == txn); Assert(change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID); - ReorderBufferReturnChange(rb, change); + ReorderBufferReturnChange(rb, change, true); } /* @@ -1309,6 +1467,15 @@ ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) dlist_delete(&txn->base_snapshot_node); } + /* + * Cleanup the snapshot for the last streamed run. + */ + if (txn->snapshot_now != NULL) + { + Assert(rbtxn_is_streamed(txn)); + ReorderBufferFreeSnap(rb, txn->snapshot_now); + } + /* * Remove TXN from its containing list. * @@ -1334,6 +1501,91 @@ ReorderBufferCleanupTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) ReorderBufferReturnTXN(rb, txn); } +/* + * Discard changes from a transaction (and subtransactions), after streaming + * them. Keep the remaining info - transactions, tuplecids, invalidations and + * snapshots. 
+ */ +static void +ReorderBufferTruncateTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + dlist_mutable_iter iter; + + /* cleanup subtransactions & their changes */ + dlist_foreach_modify(iter, &txn->subtxns) + { + ReorderBufferTXN *subtxn; + + subtxn = dlist_container(ReorderBufferTXN, node, iter.cur); + + /* + * Subtransactions are always associated to the toplevel TXN, even if + * they originally were happening inside another subtxn, so we won't + * ever recurse more than one level deep here. + */ + Assert(rbtxn_is_known_subxact(subtxn)); + Assert(subtxn->nsubtxns == 0); + + ReorderBufferTruncateTXN(rb, subtxn); + } + + /* cleanup changes in the toplevel txn */ + dlist_foreach_modify(iter, &txn->changes) + { + ReorderBufferChange *change; + + change = dlist_container(ReorderBufferChange, node, iter.cur); + + /* Check we're not mixing changes from different transactions. */ + Assert(change->txn == txn); + + /* remove the change from it's containing list */ + dlist_delete(&change->node); + + ReorderBufferReturnChange(rb, change, true); + } + + /* + * Mark the transaction as streamed. + * + * The toplevel transaction, identified by (toptxn==NULL), is marked as + * streamed always, even if it does not contain any changes (that is, when + * all the changes are in subtransactions). + * + * For subtransactions, we only mark them as streamed when there are + * changes in them. + * + * We do it this way because of aborts - we don't want to send aborts for + * XIDs the downstream is not aware of. And of course, it always knows + * about the toplevel xact (we send the XID in all messages), but we never + * stream XIDs of empty subxacts. + */ + if ((!txn->toptxn) || (txn->nentries_mem != 0)) + txn->txn_flags |= RBTXN_IS_STREAMED; + + /* + * Destroy the (relfilenode, ctid) hashtable, so that we don't leak any + * memory. We could also keep the hash table and update it with new ctid + * values, but this seems simpler and good enough for now. 
+ */ + if (txn->tuplecid_hash != NULL) + { + hash_destroy(txn->tuplecid_hash); + txn->tuplecid_hash = NULL; + } + + /* If this txn is serialized then clean the disk space. */ + if (rbtxn_is_serialized(txn)) + { + ReorderBufferRestoreCleanup(rb, txn); + txn->txn_flags &= ~RBTXN_IS_SERIALIZED; + } + + /* also reset the number of entries in the transaction */ + txn->nentries_mem = 0; + txn->nentries = 0; +} + /* * Build a hash with a (relfilenode, ctid) -> (cmin, cmax) mapping for use by * HeapTupleSatisfiesHistoricMVCC. @@ -1485,57 +1737,191 @@ ReorderBufferFreeSnap(ReorderBuffer *rb, Snapshot snap) } /* - * Perform the replay of a transaction and its non-aborted subtransactions. - * - * Subtransactions previously have to be processed by - * ReorderBufferCommitChild(), even if previously assigned to the toplevel - * transaction with ReorderBufferAssignChild. - * - * We currently can only decode a transaction's contents when its commit - * record is read because that's the only place where we know about cache - * invalidations. Thus, once a toplevel commit is read, we iterate over the top - * and subtransactions (using a k-way merge) and replay the changes in lsn - * order. + * If the transaction was (partially) streamed, we need to commit it in a + * 'streamed' way. That is, we first stream the remaining part of the + * transaction, and then invoke stream_commit message. 
*/ -void -ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, - XLogRecPtr commit_lsn, XLogRecPtr end_lsn, - TimestampTz commit_time, - RepOriginId origin_id, XLogRecPtr origin_lsn) +static void +ReorderBufferStreamCommit(ReorderBuffer *rb, ReorderBufferTXN *txn) { - ReorderBufferTXN *txn; - volatile Snapshot snapshot_now; - volatile CommandId command_id = FirstCommandId; - bool using_subtxn; - ReorderBufferIterTXNState *volatile iterstate = NULL; + /* we should only call this for previously streamed transactions */ + Assert(rbtxn_is_streamed(txn)); - txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, - false); + ReorderBufferStreamTXN(rb, txn); - /* unknown transaction, nothing to replay */ - if (txn == NULL) - return; + rb->stream_commit(rb, txn, txn->final_lsn); - txn->final_lsn = commit_lsn; - txn->end_lsn = end_lsn; - txn->commit_time = commit_time; - txn->origin_id = origin_id; - txn->origin_lsn = origin_lsn; + ReorderBufferCleanupTXN(rb, txn); +} +/* + * Set xid to detect concurrent aborts. + * + * While streaming an in-progress transaction there is a possibility that the + * (sub)transaction might get aborted concurrently. In such case if the + * (sub)transaction has catalog update then we might decode the tuple using + * wrong catalog version. For example, suppose there is one catalog tuple with + * (xmin: 500, xmax: 0). Now, the transaction 501 updates the catalog tuple + * and after that we will have two tuples (xmin: 500, xmax: 501) and + * (xmin: 501, xmax: 0). Now, if 501 is aborted and some other transaction + * say 502 updates the same catalog tuple then the first tuple will be changed + * to (xmin: 500, xmax: 502). So, the problem is that when we try to decode + * the tuple inserted/updated in 501 after the catalog update, we will see the + * catalog tuple with (xmin: 500, xmax: 502) as visible because it will + * consider that the tuple is deleted by xid 502 which is not visible to our + * snapshot. 
And when we will try to decode with that catalog tuple, it can + * lead to a wrong result or a crash. So, it is necessary to detect + * concurrent aborts to allow streaming of in-progress transactions. + * + * For detecting the concurrent abort we set CheckXidAlive to the current + * (sub)transaction's xid for which this change belongs to. And, during + * catalog scan we can check the status of the xid and if it is aborted we will + * report a specific error so that we can stop streaming current transaction + * and discard the already streamed changes on such an error. We might have + * already streamed some of the changes for the aborted (sub)transaction, but + * that is fine because when we decode the abort we will stream abort message + * to truncate the changes in the subscriber. + */ +static inline void +SetupCheckXidLive(TransactionId xid) +{ /* - * If this transaction has no snapshot, it didn't make any changes to the - * database, so there's nothing to decode. Note that - * ReorderBufferCommitChild will have transferred any snapshots from - * subtransactions if there were any. + * If the input transaction id is already set as a CheckXidAlive then + * nothing to do. */ - if (txn->base_snapshot == NULL) - { - Assert(txn->ninvalidations == 0); - ReorderBufferCleanupTXN(rb, txn); + if (TransactionIdEquals(CheckXidAlive, xid)) return; + + /* + * setup CheckXidAlive if it's not committed yet. We don't check if the + * xid is aborted. That will happen during catalog access. + */ + if (!TransactionIdDidCommit(xid)) + CheckXidAlive = xid; + else + CheckXidAlive = InvalidTransactionId; +} + +/* + * Helper function for ReorderBufferProcessTXN for applying change. 
+ */ +static inline void +ReorderBufferApplyChange(ReorderBuffer *rb, ReorderBufferTXN *txn, + Relation relation, ReorderBufferChange *change, + bool streaming) +{ + if (streaming) + rb->stream_change(rb, txn, relation, change); + else + rb->apply_change(rb, txn, relation, change); +} + +/* + * Helper function for ReorderBufferProcessTXN for applying the truncate. + */ +static inline void +ReorderBufferApplyTruncate(ReorderBuffer *rb, ReorderBufferTXN *txn, + int nrelations, Relation *relations, + ReorderBufferChange *change, bool streaming) +{ + if (streaming) + rb->stream_truncate(rb, txn, nrelations, relations, change); + else + rb->apply_truncate(rb, txn, nrelations, relations, change); +} + +/* + * Helper function for ReorderBufferProcessTXN for applying the message. + */ +static inline void +ReorderBufferApplyMessage(ReorderBuffer *rb, ReorderBufferTXN *txn, + ReorderBufferChange *change, bool streaming) +{ + if (streaming) + rb->stream_message(rb, txn, change->lsn, true, + change->data.msg.prefix, + change->data.msg.message_size, + change->data.msg.message); + else + rb->message(rb, txn, change->lsn, true, + change->data.msg.prefix, + change->data.msg.message_size, + change->data.msg.message); +} + +/* + * Function to store the command id and snapshot at the end of the current + * stream so that we can reuse the same while sending the next stream. + */ +static inline void +ReorderBufferSaveTXNSnapshot(ReorderBuffer *rb, ReorderBufferTXN *txn, + Snapshot snapshot_now, CommandId command_id) +{ + txn->command_id = command_id; + + /* Avoid copying if it's already copied. */ + if (snapshot_now->copied) + txn->snapshot_now = snapshot_now; + else + txn->snapshot_now = ReorderBufferCopySnap(rb, snapshot_now, + txn, command_id); +} + +/* + * Helper function for ReorderBufferProcessTXN to handle the concurrent + * abort of the streaming transaction. This resets the TXN such that it + * can be used to stream the remaining data of transaction being processed. 
+ */ +static void +ReorderBufferResetTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, + Snapshot snapshot_now, + CommandId command_id, + XLogRecPtr last_lsn, + ReorderBufferChange *specinsert) +{ + /* Discard the changes that we just streamed */ + ReorderBufferTruncateTXN(rb, txn); + + /* Free all resources allocated for toast reconstruction */ + ReorderBufferToastReset(rb, txn); + + /* Return the spec insert change if it is not NULL */ + if (specinsert != NULL) + { + ReorderBufferReturnChange(rb, specinsert, true); + specinsert = NULL; } - snapshot_now = txn->base_snapshot; + /* Stop the stream. */ + rb->stream_stop(rb, txn, last_lsn); + + /* Remember the command ID and snapshot for the streaming run */ + ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id); +} + +/* + * Helper function for ReorderBufferCommit and ReorderBufferStreamTXN. + * + * Send data of a transaction (and its subtransactions) to the + * output plugin. We iterate over the top and subtransactions (using a k-way + * merge) and replay the changes in lsn order. + * + * If streaming is true then data will be sent using stream API. 
+ */ +static void +ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, + XLogRecPtr commit_lsn, + volatile Snapshot snapshot_now, + volatile CommandId command_id, + bool streaming) +{ + bool using_subtxn; + MemoryContext ccxt = CurrentMemoryContext; + ReorderBufferIterTXNState *volatile iterstate = NULL; + volatile XLogRecPtr prev_lsn = InvalidXLogRecPtr; + ReorderBufferChange *volatile specinsert = NULL; + volatile bool stream_started = false; + ReorderBufferTXN *volatile curtxn = NULL; /* build data to be able to lookup the CommandIds of catalog tuples */ ReorderBufferBuildTupleCidHash(rb, txn); @@ -1558,14 +1944,15 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, PG_TRY(); { ReorderBufferChange *change; - ReorderBufferChange *specinsert = NULL; if (using_subtxn) - BeginInternalSubTransaction("replay"); + BeginInternalSubTransaction(streaming ? "stream" : "replay"); else StartTransactionCommand(); - rb->begin(rb, txn); + /* We only need to send begin/commit for non-streamed transactions. */ + if (!streaming) + rb->begin(rb, txn); ReorderBufferIterTXNInit(rb, txn, &iterstate); while ((change = ReorderBufferIterTXNNext(rb, iterstate)) != NULL) @@ -1573,6 +1960,36 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, Relation relation = NULL; Oid reloid; + /* + * We can't call start stream callback before processing first + * change. + */ + if (prev_lsn == InvalidXLogRecPtr) + { + if (streaming) + { + txn->origin_id = change->origin_id; + rb->stream_start(rb, txn, change->lsn); + stream_started = true; + } + } + + /* + * Enforce correct ordering of changes, merged from multiple + * subtransactions. The changes may have the same LSN due to + * MULTI_INSERT xlog records. + */ + Assert(prev_lsn == InvalidXLogRecPtr || prev_lsn <= change->lsn); + + prev_lsn = change->lsn; + + /* Set the current xid to detect concurrent aborts. 
*/ + if (streaming) + { + curtxn = change->txn; + SetupCheckXidLive(curtxn->xid); + } + switch (change->action) { case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM: @@ -1649,7 +2066,8 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, if (!IsToastRelation(relation)) { ReorderBufferToastReplace(rb, txn, relation, change); - rb->apply_change(rb, txn, relation, change); + ReorderBufferApplyChange(rb, txn, relation, change, + streaming); /* * Only clear reassembled toast chunks if we're sure @@ -1685,11 +2103,11 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, */ if (specinsert != NULL) { - ReorderBufferReturnChange(rb, specinsert); + ReorderBufferReturnChange(rb, specinsert, true); specinsert = NULL; } - if (relation != NULL) + if (RelationIsValid(relation)) { RelationClose(relation); relation = NULL; @@ -1714,7 +2132,7 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, /* clear out a pending (and thus failed) speculation */ if (specinsert != NULL) { - ReorderBufferReturnChange(rb, specinsert); + ReorderBufferReturnChange(rb, specinsert, true); specinsert = NULL; } @@ -1747,7 +2165,10 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, relations[nrelations++] = relation; } - rb->apply_truncate(rb, txn, nrelations, relations, change); + /* Apply the truncate. 
*/ + ReorderBufferApplyTruncate(rb, txn, nrelations, + relations, change, + streaming); for (i = 0; i < nrelations; i++) RelationClose(relations[i]); @@ -1756,10 +2177,7 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, } case REORDER_BUFFER_CHANGE_MESSAGE: - rb->message(rb, txn, change->lsn, true, - change->data.msg.prefix, - change->data.msg.message_size, - change->data.msg.message); + ReorderBufferApplyMessage(rb, txn, change, streaming); break; case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT: @@ -1790,7 +2208,6 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, snapshot_now = change->data.snapshot; } - /* and continue with the new one */ SetupHistoricSnapshot(snapshot_now, txn->tuplecid_hash); break; @@ -1837,7 +2254,7 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, */ if (specinsert) { - ReorderBufferReturnChange(rb, specinsert); + ReorderBufferReturnChange(rb, specinsert, true); specinsert = NULL; } @@ -1845,14 +2262,35 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, ReorderBufferIterTXNFinish(rb, iterstate); iterstate = NULL; - /* call commit callback */ - rb->commit(rb, txn, commit_lsn); + /* + * Done with current changes, send the last message for this set of + * changes depending upon streaming mode. + */ + if (streaming) + { + if (stream_started) + { + rb->stream_stop(rb, txn, prev_lsn); + stream_started = false; + } + } + else + rb->commit(rb, txn, commit_lsn); /* this is just a sanity check against bad output plugin behaviour */ if (GetCurrentTransactionIdIfAny() != InvalidTransactionId) elog(ERROR, "output plugin used XID %u", GetCurrentTransactionId()); + /* + * Remember the command ID and snapshot for the next set of changes in + * streaming mode. 
+ */ + if (streaming) + ReorderBufferSaveTXNSnapshot(rb, txn, snapshot_now, command_id); + else if (snapshot_now->copied) + ReorderBufferFreeSnap(rb, snapshot_now); + /* cleanup */ TeardownHistoricSnapshot(false); @@ -1870,14 +2308,27 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, if (using_subtxn) RollbackAndReleaseCurrentSubTransaction(); - if (snapshot_now->copied) - ReorderBufferFreeSnap(rb, snapshot_now); + /* + * If we are streaming the in-progress transaction then discard the + * changes that we just streamed, and mark the transactions as + * streamed (if they contained changes). Otherwise, remove all the + * changes and deallocate the ReorderBufferTXN. + */ + if (streaming) + { + ReorderBufferTruncateTXN(rb, txn); - /* remove potential on-disk data, and deallocate */ - ReorderBufferCleanupTXN(rb, txn); + /* Reset the CheckXidAlive */ + CheckXidAlive = InvalidTransactionId; + } + else + ReorderBufferCleanupTXN(rb, txn); } PG_CATCH(); { + MemoryContext ecxt = MemoryContextSwitchTo(ccxt); + ErrorData *errdata = CopyErrorData(); + /* TODO: Encapsulate cleanup from the PG_TRY and PG_CATCH blocks */ if (iterstate) ReorderBufferIterTXNFinish(rb, iterstate); @@ -1896,15 +2347,106 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, if (using_subtxn) RollbackAndReleaseCurrentSubTransaction(); - if (snapshot_now->copied) - ReorderBufferFreeSnap(rb, snapshot_now); + /* + * The error code ERRCODE_TRANSACTION_ROLLBACK indicates a concurrent + * abort of the (sub)transaction we are streaming. We need to do the + * cleanup and return gracefully on this error, see SetupCheckXidLive. + */ + if (errdata->sqlerrcode == ERRCODE_TRANSACTION_ROLLBACK) + { + /* + * This error can only occur when we are sending the data in + * streaming mode and the streaming is not finished yet. + */ + Assert(streaming); + Assert(stream_started); + + /* Cleanup the temporary error state. 
*/ + FlushErrorState(); + FreeErrorData(errdata); + errdata = NULL; + curtxn->concurrent_abort = true; + + /* Reset the TXN so that it is allowed to stream remaining data. */ + ReorderBufferResetTXN(rb, txn, snapshot_now, + command_id, prev_lsn, + specinsert); + } + else + { + ReorderBufferCleanupTXN(rb, txn); + MemoryContextSwitchTo(ecxt); + PG_RE_THROW(); + } + } + PG_END_TRY(); +} - /* remove potential on-disk data, and deallocate */ - ReorderBufferCleanupTXN(rb, txn); +/* + * Perform the replay of a transaction and its non-aborted subtransactions. + * + * Subtransactions previously have to be processed by + * ReorderBufferCommitChild(), even if previously assigned to the toplevel + * transaction with ReorderBufferAssignChild. + * + * This interface is called once a toplevel commit is read for both streamed + * as well as non-streamed transactions. + */ +void +ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, + XLogRecPtr commit_lsn, XLogRecPtr end_lsn, + TimestampTz commit_time, + RepOriginId origin_id, XLogRecPtr origin_lsn) +{ + ReorderBufferTXN *txn; + Snapshot snapshot_now; + CommandId command_id = FirstCommandId; + + txn = ReorderBufferTXNByXid(rb, xid, false, NULL, InvalidXLogRecPtr, + false); - PG_RE_THROW(); + /* unknown transaction, nothing to replay */ + if (txn == NULL) + return; + + txn->final_lsn = commit_lsn; + txn->end_lsn = end_lsn; + txn->commit_time = commit_time; + txn->origin_id = origin_id; + txn->origin_lsn = origin_lsn; + + /* + * If the transaction was (partially) streamed, we need to commit it in a + * 'streamed' way. That is, we first stream the remaining part of the + * transaction, and then invoke stream_commit message. + * + * Called after everything (origin ID, LSN, ...) is stored in the + * transaction to avoid passing that information directly. 
+ */ + if (rbtxn_is_streamed(txn)) + { + ReorderBufferStreamCommit(rb, txn); + return; } - PG_END_TRY(); + + /* + * If this transaction has no snapshot, it didn't make any changes to the + * database, so there's nothing to decode. Note that + * ReorderBufferCommitChild will have transferred any snapshots from + * subtransactions if there were any. + */ + if (txn->base_snapshot == NULL) + { + Assert(txn->ninvalidations == 0); + ReorderBufferCleanupTXN(rb, txn); + return; + } + + snapshot_now = txn->base_snapshot; + + /* Process and send the changes to output plugin. */ + ReorderBufferProcessTXN(rb, txn, commit_lsn, snapshot_now, + command_id, false); } /* @@ -1931,6 +2473,22 @@ ReorderBufferAbort(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn) if (txn == NULL) return; + /* For streamed transactions notify the remote node about the abort. */ + if (rbtxn_is_streamed(txn)) + { + rb->stream_abort(rb, txn, lsn); + + /* + * We might have decoded changes for this transaction that could load + * the cache as per the current transaction's view (consider DDL's + * happened in this transaction). We don't want the decoding of future + * transactions to use those cache entries so execute invalidations. + */ + if (txn->ninvalidations > 0) + ReorderBufferImmediateInvalidation(rb, txn->ninvalidations, + txn->invalidations); + } + /* cosmetic... */ txn->final_lsn = lsn; @@ -2000,6 +2558,10 @@ ReorderBufferForget(ReorderBuffer *rb, TransactionId xid, XLogRecPtr lsn) if (txn == NULL) return; + /* For streamed transactions notify the remote node about the abort. */ + if (rbtxn_is_streamed(txn)) + rb->stream_abort(rb, txn, lsn); + /* cosmetic... 
*/ txn->final_lsn = lsn; @@ -2082,7 +2644,7 @@ ReorderBufferAddSnapshot(ReorderBuffer *rb, TransactionId xid, change->data.snapshot = snap; change->action = REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT; - ReorderBufferQueueChange(rb, xid, lsn, change); + ReorderBufferQueueChange(rb, xid, lsn, change, false); } /* @@ -2131,12 +2693,21 @@ ReorderBufferAddNewCommandId(ReorderBuffer *rb, TransactionId xid, change->data.command_id = cid; change->action = REORDER_BUFFER_CHANGE_INTERNAL_COMMAND_ID; - ReorderBufferQueueChange(rb, xid, lsn, change); + ReorderBufferQueueChange(rb, xid, lsn, change, false); } /* - * Update the memory accounting info. We track memory used by the whole - * reorder buffer and the transaction containing the change. + * Update memory counters to account for the new or removed change. + * + * We update two counters - in the reorder buffer, and in the transaction + * containing the change. The reorder buffer counter allows us to quickly + * decide if we reached the memory limit, the transaction counter allows + * us to quickly pick the largest transaction for eviction. + * + * When streaming is enabled, we need to update the toplevel transaction + * counters instead - we don't really care about subtransactions as we + * can't stream them individually anyway, and we only pick toplevel + * transactions for eviction. So only toplevel transactions matter. */ static void ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb, @@ -2144,6 +2715,8 @@ ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb, bool addition) { Size sz; + ReorderBufferTXN *txn; + ReorderBufferTXN *toptxn = NULL; Assert(change->txn); @@ -2155,19 +2728,41 @@ ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb, if (change->action == REORDER_BUFFER_CHANGE_INTERNAL_TUPLECID) return; + txn = change->txn; + + /* If streaming supported, update the total size in top level as well. 
*/ + if (ReorderBufferCanStream(rb)) + { + if (txn->toptxn != NULL) + toptxn = txn->toptxn; + else + toptxn = txn; + } + sz = ReorderBufferChangeSize(change); if (addition) { - change->txn->size += sz; + txn->size += sz; rb->size += sz; + + /* Update the total size in the top transaction. */ + if (toptxn) + toptxn->total_size += sz; } else { - Assert((rb->size >= sz) && (change->txn->size >= sz)); - change->txn->size -= sz; + Assert((rb->size >= sz) && (txn->size >= sz)); + txn->size -= sz; rb->size -= sz; + + /* Update the total size in the top transaction. */ + if (toptxn) + toptxn->total_size -= sz; } + + Assert(txn->size <= rb->size); + Assert((txn->size >= 0) && (rb->size >= 0)); } /* @@ -2387,6 +2982,51 @@ ReorderBufferLargestTXN(ReorderBuffer *rb) return largest; } +/* + * Find the largest toplevel transaction to evict (by streaming). + * + * This can be seen as an optimized version of ReorderBufferLargestTXN, which + * should give us the same transaction (because we don't update memory account + * for subtransaction with streaming, so it's always 0). But we can simply + * iterate over the limited number of toplevel transactions. + * + * Note that, we skip transactions that contains incomplete changes. There + * is a scope of optimization here such that we can select the largest transaction + * which has complete changes. But that will make the code and design quite complex + * and that might not be worth the benefit. If we plan to stream the transactions + * that contains incomplete changes then we need to find a way to partially + * stream/truncate the transaction changes in-memory and build a mechanism to + * partially truncate the spilled files. Additionally, whenever we partially + * stream the transaction we need to maintain the last streamed lsn and next time + * we need to restore from that segment and the offset in WAL. 
As we stream the + * changes from the top transaction and restore them subtransaction wise, we need + * to even remember the subxact from where we streamed the last change. + */ +static ReorderBufferTXN * +ReorderBufferLargestTopTXN(ReorderBuffer *rb) +{ + dlist_iter iter; + Size largest_size = 0; + ReorderBufferTXN *largest = NULL; + + /* Find the largest top-level transaction. */ + dlist_foreach(iter, &rb->toplevel_by_lsn) + { + ReorderBufferTXN *txn; + + txn = dlist_container(ReorderBufferTXN, node, iter.cur); + + if ((largest != NULL || txn->total_size > largest_size) && + (txn->total_size > 0) && !(rbtxn_has_incomplete_tuple(txn))) + { + largest = txn; + largest_size = txn->total_size; + } + } + + return largest; +} + /* * Check whether the logical_decoding_work_mem limit was reached, and if yes * pick the largest (sub)transaction at-a-time to evict and spill its changes to @@ -2419,11 +3059,33 @@ ReorderBufferCheckMemoryLimit(ReorderBuffer *rb) { /* * Pick the largest transaction (or subtransaction) and evict it from - * memory by serializing it to disk. + * memory by streaming, if possible. Otherwise, spill to disk. */ - txn = ReorderBufferLargestTXN(rb); + if (ReorderBufferCanStartStreaming(rb) && + (txn = ReorderBufferLargestTopTXN(rb)) != NULL) + { + /* we know there has to be one, because the size is not zero */ + Assert(txn && !txn->toptxn); + Assert(txn->total_size > 0); + Assert(rb->size >= txn->total_size); - ReorderBufferSerializeTXN(rb, txn); + ReorderBufferStreamTXN(rb, txn); + } + else + { + /* + * Pick the largest transaction (or subtransaction) and evict it + * from memory by serializing it to disk. 
+ */ + txn = ReorderBufferLargestTXN(rb); + + /* we know there has to be one, because the size is not zero */ + Assert(txn); + Assert(txn->size > 0); + Assert(rb->size >= txn->size); + + ReorderBufferSerializeTXN(rb, txn); + } /* * After eviction, the transaction should have no entries in memory, @@ -2501,7 +3163,7 @@ ReorderBufferSerializeTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) ReorderBufferSerializeChange(rb, txn, fd, change); dlist_delete(&change->node); - ReorderBufferReturnChange(rb, change); + ReorderBufferReturnChange(rb, change, true); spilled++; } @@ -2713,6 +3375,136 @@ ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn, Assert(ondisk->change.action == change->action); } +/* Returns true, if the output plugin supports streaming, false, otherwise. */ +static inline bool +ReorderBufferCanStream(ReorderBuffer *rb) +{ + LogicalDecodingContext *ctx = rb->private_data; + + return ctx->streaming; +} + +/* Returns true, if the streaming can be started now, false, otherwise. */ +static inline bool +ReorderBufferCanStartStreaming(ReorderBuffer *rb) +{ + LogicalDecodingContext *ctx = rb->private_data; + SnapBuild *builder = ctx->snapshot_builder; + + /* + * We can't start streaming immediately even if the streaming is enabled + * because we previously decoded this transaction and now just are + * restarting. + */ + if (ReorderBufferCanStream(rb) && + !SnapBuildXactNeedsSkip(builder, ctx->reader->EndRecPtr)) + { + /* We must have a consistent snapshot by this time */ + Assert(SnapBuildCurrentState(builder) == SNAPBUILD_CONSISTENT); + return true; + } + + return false; +} + +/* + * Send data of a large transaction (and its subtransactions) to the + * output plugin, but using the stream API. + */ +static void +ReorderBufferStreamTXN(ReorderBuffer *rb, ReorderBufferTXN *txn) +{ + Snapshot snapshot_now; + CommandId command_id; + + /* We can never reach here for a subtransaction. 
*/ + Assert(txn->toptxn == NULL); + + /* + * We can't make any assumptions about base snapshot here, similar to what + * ReorderBufferCommit() does. That relies on base_snapshot getting + * transferred from subxact in ReorderBufferCommitChild(), but that was + * not yet called as the transaction is in-progress. + * + * So just walk the subxacts and use the same logic here. But we only need + * to do that once, when the transaction is streamed for the first time. + * After that we need to reuse the snapshot from the previous run. + * + * Unlike DecodeCommit which adds xids of all the subtransactions in + * snapshot's xip array via SnapBuildCommittedTxn, we can't do that here + * but we do add them to subxip array instead via ReorderBufferCopySnap. + * This allows the catalog changes made in subtransactions decoded till + * now to be visible. + */ + if (txn->snapshot_now == NULL) + { + dlist_iter subxact_i; + + /* make sure this transaction is streamed for the first time */ + Assert(!rbtxn_is_streamed(txn)); + + /* at the beginning we should have invalid command ID */ + Assert(txn->command_id == InvalidCommandId); + + dlist_foreach(subxact_i, &txn->subtxns) + { + ReorderBufferTXN *subtxn; + + subtxn = dlist_container(ReorderBufferTXN, node, subxact_i.cur); + ReorderBufferTransferSnapToParent(txn, subtxn); + } + + /* + * If this transaction has no snapshot, it didn't make any changes to + * the database till now, so there's nothing to decode. + */ + if (txn->base_snapshot == NULL) + { + Assert(txn->ninvalidations == 0); + return; + } + + command_id = FirstCommandId; + snapshot_now = ReorderBufferCopySnap(rb, txn->base_snapshot, + txn, command_id); + } + else + { + /* the transaction must have been already streamed */ + Assert(rbtxn_is_streamed(txn)); + + /* + * Nah, we already have snapshot from the previous streaming run. 
We + * assume new subxacts can't move the LSN backwards, and so can't beat + * the LSN condition in the previous branch (so no need to walk + * through subxacts again). In fact, we must not do that as we may be + * using snapshot half-way through the subxact. + */ + command_id = txn->command_id; + + /* + * We can't use txn->snapshot_now directly because after the last + * streaming run, we might have got some new sub-transactions. So we + * need to add them to the snapshot. + */ + snapshot_now = ReorderBufferCopySnap(rb, txn->snapshot_now, + txn, command_id); + + /* Free the previously copied snapshot. */ + Assert(txn->snapshot_now->copied); + ReorderBufferFreeSnap(rb, txn->snapshot_now); + txn->snapshot_now = NULL; + } + + /* Process and send the changes to output plugin. */ + ReorderBufferProcessTXN(rb, txn, InvalidXLogRecPtr, snapshot_now, + command_id, true); + + Assert(dlist_is_empty(&txn->changes)); + Assert(txn->nentries == 0); + Assert(txn->nentries_mem == 0); +} + /* * Size of a change in memory. */ @@ -2813,7 +3605,7 @@ ReorderBufferRestoreChanges(ReorderBuffer *rb, ReorderBufferTXN *txn, dlist_container(ReorderBufferChange, node, cleanup_iter.cur); dlist_delete(&cleanup->node); - ReorderBufferReturnChange(rb, cleanup); + ReorderBufferReturnChange(rb, cleanup, true); } txn->nentries_mem = 0; Assert(dlist_is_empty(&txn->changes)); @@ -3522,7 +4314,7 @@ ReorderBufferToastReset(ReorderBuffer *rb, ReorderBufferTXN *txn) dlist_container(ReorderBufferChange, node, it.cur); dlist_delete(&change->node); - ReorderBufferReturnChange(rb, change); + ReorderBufferReturnChange(rb, change, true); } } @@ -3812,6 +4604,17 @@ ResolveCminCmaxDuringDecoding(HTAB *tuplecid_data, BlockNumber blockno; bool updated_mapping = false; + /* + * Return unresolved if tuplecid_data is not valid. That's because when + * streaming in-progress transactions we may run into tuples with the CID + * before actually decoding them. Think e.g. 
about INSERT followed by + * TRUNCATE, where the TRUNCATE may not be decoded yet when applying the + * INSERT. So in such cases, we assume the CID is from the future + * command. + */ + if (tuplecid_data == NULL) + return false; + /* be careful about padding */ memset(&key, 0, sizeof(key)); diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index 95d18cdb12e7..aa17f7df84d4 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -67,6 +67,7 @@ #define XLH_INSERT_LAST_IN_MULTI (1<<1) #define XLH_INSERT_IS_SPECULATIVE (1<<2) #define XLH_INSERT_CONTAINS_NEW_TUPLE (1<<3) +#define XLH_INSERT_ON_TOAST_RELATION (1<<4) /* * xl_heap_update flag values, 8 bits are available. diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 7ba72c84e021..387eb34a61a3 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -19,6 +19,7 @@ #include "access/relscan.h" #include "access/sdir.h" +#include "access/xact.h" #include "utils/guc.h" #include "utils/rel.h" #include "utils/snapshot.h" @@ -903,6 +904,15 @@ static inline bool table_scan_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot) { slot->tts_tableOid = RelationGetRelid(sscan->rs_rd); + + /* + * We don't expect direct calls to table_scan_getnextslot with valid + * CheckXidAlive for catalog or regular tables. See detailed comments in + * xact.c where these variables are declared. + */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "unexpected table_scan_getnextslot call during logical decoding"); + return sscan->rs_rd->rd_tableam->scan_getnextslot(sscan, direction, slot); } @@ -1017,6 +1027,13 @@ table_index_fetch_tuple(struct IndexFetchTableData *scan, TupleTableSlot *slot, bool *call_again, bool *all_dead) { + /* + * We don't expect direct calls to table_index_fetch_tuple with valid + * CheckXidAlive for catalog or regular tables. 
See detailed comments in + * xact.c where these variables are declared. + */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "unexpected table_index_fetch_tuple call during logical decoding"); return scan->rel->rd_tableam->index_fetch_tuple(scan, tid, snapshot, slot, call_again, @@ -1056,6 +1073,14 @@ table_tuple_fetch_row_version(Relation rel, Snapshot snapshot, TupleTableSlot *slot) { + /* + * We don't expect direct calls to table_tuple_fetch_row_version with + * valid CheckXidAlive for catalog or regular tables. See detailed + * comments in xact.c where these variables are declared. + */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "unexpected table_tuple_fetch_row_version call during logical decoding"); + return rel->rd_tableam->tuple_fetch_row_version(rel, tid, snapshot, slot); } @@ -1713,6 +1738,14 @@ static inline bool table_scan_bitmap_next_block(TableScanDesc scan, struct TBMIterateResult *tbmres) { + /* + * We don't expect direct calls to table_scan_bitmap_next_block with valid + * CheckXidAlive for catalog or regular tables. See detailed comments in + * xact.c where these variables are declared. + */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "unexpected table_scan_bitmap_next_block call during logical decoding"); + return scan->rs_rd->rd_tableam->scan_bitmap_next_block(scan, tbmres); } @@ -1730,6 +1763,14 @@ table_scan_bitmap_next_tuple(TableScanDesc scan, struct TBMIterateResult *tbmres, TupleTableSlot *slot) { + /* + * We don't expect direct calls to table_scan_bitmap_next_tuple with valid + * CheckXidAlive for catalog or regular tables. See detailed comments in + * xact.c where these variables are declared. 
+ */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "unexpected table_scan_bitmap_next_tuple call during logical decoding"); + return scan->rs_rd->rd_tableam->scan_bitmap_next_tuple(scan, tbmres, slot); @@ -1748,6 +1789,13 @@ static inline bool table_scan_sample_next_block(TableScanDesc scan, struct SampleScanState *scanstate) { + /* + * We don't expect direct calls to table_scan_sample_next_block with valid + * CheckXidAlive for catalog or regular tables. See detailed comments in + * xact.c where these variables are declared. + */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "unexpected table_scan_sample_next_block call during logical decoding"); return scan->rs_rd->rd_tableam->scan_sample_next_block(scan, scanstate); } @@ -1764,6 +1812,13 @@ table_scan_sample_next_tuple(TableScanDesc scan, struct SampleScanState *scanstate, TupleTableSlot *slot) { + /* + * We don't expect direct calls to table_scan_sample_next_tuple with valid + * CheckXidAlive for catalog or regular tables. See detailed comments in + * xact.c where these variables are declared. + */ + if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan)) + elog(ERROR, "unexpected table_scan_sample_next_tuple call during logical decoding"); return scan->rs_rd->rd_tableam->scan_sample_next_tuple(scan, scanstate, slot); } diff --git a/src/include/access/xact.h b/src/include/access/xact.h index 53480116a462..c18554bae2c2 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -81,6 +81,10 @@ typedef enum /* Synchronous commit level */ extern int synchronous_commit; +/* used during logical streaming of a transaction */ +extern TransactionId CheckXidAlive; +extern bool bsysscan; + /* * Miscellaneous flag bits to record events which occur on the top level * transaction. 
These flags are only persisted in MyXactFlags and are intended diff --git a/src/include/replication/logical.h b/src/include/replication/logical.h index deef31825d6e..b0fae9808bf6 100644 --- a/src/include/replication/logical.h +++ b/src/include/replication/logical.h @@ -121,5 +121,6 @@ extern void LogicalIncreaseRestartDecodingForSlot(XLogRecPtr current_lsn, extern void LogicalConfirmReceivedLocation(XLogRecPtr lsn); extern bool filter_by_origin_cb_wrapper(LogicalDecodingContext *ctx, RepOriginId origin_id); +extern void ResetLogicalStreamingState(void); #endif diff --git a/src/include/replication/reorderbuffer.h b/src/include/replication/reorderbuffer.h index 42bc81764873..1ae17d5f11fd 100644 --- a/src/include/replication/reorderbuffer.h +++ b/src/include/replication/reorderbuffer.h @@ -162,6 +162,9 @@ typedef struct ReorderBufferChange #define RBTXN_HAS_CATALOG_CHANGES 0x0001 #define RBTXN_IS_SUBXACT 0x0002 #define RBTXN_IS_SERIALIZED 0x0004 +#define RBTXN_IS_STREAMED 0x0008 +#define RBTXN_HAS_TOAST_INSERT 0x0010 +#define RBTXN_HAS_SPEC_INSERT 0x0020 /* Does the transaction have catalog changes? */ #define rbtxn_has_catalog_changes(txn) \ @@ -181,6 +184,40 @@ typedef struct ReorderBufferChange ((txn)->txn_flags & RBTXN_IS_SERIALIZED) != 0 \ ) +/* This transaction's changes has toast insert, without main table insert. */ +#define rbtxn_has_toast_insert(txn) \ +( \ + ((txn)->txn_flags & RBTXN_HAS_TOAST_INSERT) != 0 \ +) +/* + * This transaction's changes has speculative insert, without speculative + * confirm. + */ +#define rbtxn_has_spec_insert(txn) \ +( \ + ((txn)->txn_flags & RBTXN_HAS_SPEC_INSERT) != 0 \ +) + +/* Check whether this transaction has an incomplete change. */ +#define rbtxn_has_incomplete_tuple(txn) \ +( \ + rbtxn_has_toast_insert(txn) || rbtxn_has_spec_insert(txn) \ +) + +/* + * Has this transaction been streamed to downstream? + * + * (It's not possible to deduce this from nentries and nentries_mem for + * various reasons. 
For example, all changes may be in subtransactions in + * which case we'd have nentries==0 for the toplevel one, which would say + * nothing about the streaming. So we maintain this flag, but only for the + * toplevel transaction.) + */ +#define rbtxn_is_streamed(txn) \ +( \ + ((txn)->txn_flags & RBTXN_IS_STREAMED) != 0 \ +) + typedef struct ReorderBufferTXN { /* See above */ @@ -248,6 +285,13 @@ typedef struct ReorderBufferTXN XLogRecPtr base_snapshot_lsn; dlist_node base_snapshot_node; /* link in txns_by_base_snapshot_lsn */ + /* + * Snapshot/CID from the previous streaming run. Only valid for already + * streamed transactions (NULL/InvalidCommandId otherwise). + */ + Snapshot snapshot_now; + CommandId command_id; + /* * How many ReorderBufferChange's do we have in this txn. * @@ -313,6 +357,12 @@ typedef struct ReorderBufferTXN * Size of this transaction (changes currently in memory, in bytes). */ Size size; + + /* Size of top-transaction including sub-transactions. */ + Size total_size; + + /* If we have detected concurrent abort then ignore future changes. 
*/ + bool concurrent_abort; } ReorderBufferTXN; /* so we can define the callbacks used inside struct ReorderBuffer itself */ @@ -484,12 +534,14 @@ void ReorderBufferFree(ReorderBuffer *); ReorderBufferTupleBuf *ReorderBufferGetTupleBuf(ReorderBuffer *, Size tuple_len); void ReorderBufferReturnTupleBuf(ReorderBuffer *, ReorderBufferTupleBuf *tuple); ReorderBufferChange *ReorderBufferGetChange(ReorderBuffer *); -void ReorderBufferReturnChange(ReorderBuffer *, ReorderBufferChange *); +void ReorderBufferReturnChange(ReorderBuffer *, ReorderBufferChange *, bool); Oid *ReorderBufferGetRelids(ReorderBuffer *, int nrelids); void ReorderBufferReturnRelids(ReorderBuffer *, Oid *relids); -void ReorderBufferQueueChange(ReorderBuffer *, TransactionId, XLogRecPtr lsn, ReorderBufferChange *); +void ReorderBufferQueueChange(ReorderBuffer *, TransactionId, + XLogRecPtr lsn, ReorderBufferChange *, + bool toast_insert); void ReorderBufferQueueMessage(ReorderBuffer *, TransactionId, Snapshot snapshot, XLogRecPtr lsn, bool transactional, const char *prefix, Size message_size, const char *message); From a13421c96c0e8ffa34310f92d03d0e6f3bfa27f8 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Sat, 8 Aug 2020 07:31:52 +0200 Subject: [PATCH 269/334] Add some const decorations --- src/backend/replication/logical/logical.c | 6 +++--- src/include/replication/logical.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c index 42f284b33f6b..f5eb6bc3aff2 100644 --- a/src/backend/replication/logical/logical.c +++ b/src/backend/replication/logical/logical.c @@ -82,7 +82,7 @@ static void stream_message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *tx static void stream_truncate_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, int nrelations, Relation relations[], ReorderBufferChange *change); -static void LoadOutputPlugin(OutputPluginCallbacks *callbacks, char *plugin); +static 
void LoadOutputPlugin(OutputPluginCallbacks *callbacks, const char *plugin); /* * Make sure the current settings & environment are capable of doing logical @@ -277,7 +277,7 @@ StartupDecodingContext(List *output_plugin_options, * startup function. */ LogicalDecodingContext * -CreateInitDecodingContext(char *plugin, +CreateInitDecodingContext(const char *plugin, List *output_plugin_options, bool need_full_snapshot, XLogRecPtr restart_lsn, @@ -612,7 +612,7 @@ OutputPluginUpdateProgress(struct LogicalDecodingContext *ctx) * that it provides the required callbacks. */ static void -LoadOutputPlugin(OutputPluginCallbacks *callbacks, char *plugin) +LoadOutputPlugin(OutputPluginCallbacks *callbacks, const char *plugin) { LogicalOutputPluginInit plugin_init; diff --git a/src/include/replication/logical.h b/src/include/replication/logical.h index b0fae9808bf6..45abc444b7a5 100644 --- a/src/include/replication/logical.h +++ b/src/include/replication/logical.h @@ -96,7 +96,7 @@ typedef struct LogicalDecodingContext extern void CheckLogicalDecodingRequirements(void); -extern LogicalDecodingContext *CreateInitDecodingContext(char *plugin, +extern LogicalDecodingContext *CreateInitDecodingContext(const char *plugin, List *output_plugin_options, bool need_full_snapshot, XLogRecPtr restart_lsn, From 82a0ba7707e010a29f5fe1a0020d963c82b8f1cb Mon Sep 17 00:00:00 2001 From: Amit Kapila Date: Sat, 8 Aug 2020 12:13:18 +0530 Subject: [PATCH 270/334] Fix the logical streaming test. Commit 7259736a6e added the capability to stream changes in ReorderBuffer which has some tests to test the streaming mode. It is quite possible that while this test is running a parallel transaction could be logged by autovacuum. Such a transaction won't perform any insert/update/delete to non-catalog tables so will be shown as an empty transaction. Fix it by skipping the empty transactions during this test. Per report by buildfarm. 
--- contrib/test_decoding/expected/stream.out | 4 ++-- contrib/test_decoding/sql/stream.sql | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/contrib/test_decoding/expected/stream.out b/contrib/test_decoding/expected/stream.out index 9a5d7e7c4399..d7e32f818546 100644 --- a/contrib/test_decoding/expected/stream.out +++ b/contrib/test_decoding/expected/stream.out @@ -26,7 +26,7 @@ TRUNCATE table stream_test; rollback to s1; INSERT INTO stream_test SELECT repeat('a', 10) || g.i FROM generate_series(1, 20) g(i); COMMIT; -SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL,NULL, 'include-xids', '0', 'stream-changes', '1'); +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL,NULL, 'include-xids', '0', 'skip-empty-xacts', '1', 'stream-changes', '1'); data ---------------------------------------------------------- opening a streamed block for transaction @@ -67,7 +67,7 @@ SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'inc (0 rows) INSERT INTO stream_test SELECT repeat('a', 6000) || g.i FROM generate_series(1, 10) g(i); -SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL,NULL, 'include-xids', '0', 'stream-changes', '1'); +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL,NULL, 'include-xids', '0', 'skip-empty-xacts', '1', 'stream-changes', '1'); data ------------------------------------------ opening a streamed block for transaction diff --git a/contrib/test_decoding/sql/stream.sql b/contrib/test_decoding/sql/stream.sql index 8abc30de0afc..ce86c816d11f 100644 --- a/contrib/test_decoding/sql/stream.sql +++ b/contrib/test_decoding/sql/stream.sql @@ -16,7 +16,7 @@ rollback to s1; INSERT INTO stream_test SELECT repeat('a', 10) || g.i FROM generate_series(1, 20) g(i); COMMIT; -SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL,NULL, 'include-xids', '0', 'stream-changes', '1'); +SELECT data FROM pg_logical_slot_get_changes('regression_slot', 
NULL,NULL, 'include-xids', '0', 'skip-empty-xacts', '1', 'stream-changes', '1'); -- streaming test for toast changes ALTER TABLE stream_test ALTER COLUMN data set storage external; @@ -24,7 +24,7 @@ ALTER TABLE stream_test ALTER COLUMN data set storage external; SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1'); INSERT INTO stream_test SELECT repeat('a', 6000) || g.i FROM generate_series(1, 10) g(i); -SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL,NULL, 'include-xids', '0', 'stream-changes', '1'); +SELECT data FROM pg_logical_slot_get_changes('regression_slot', NULL,NULL, 'include-xids', '0', 'skip-empty-xacts', '1', 'stream-changes', '1'); DROP TABLE stream_test; SELECT pg_drop_replication_slot('regression_slot'); From 470687b4a5bb3b9f2b5bf7c9235680b3c91bd050 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Sat, 8 Aug 2020 12:31:55 -0400 Subject: [PATCH 271/334] walsnd: Don't set waiting_for_ping_response spuriously MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ashutosh Bapat noticed that when logical walsender needs to wait for WAL, and it realizes that it must send a keepalive message to walreceiver to update the sent-LSN, which *does not* request a reply from walreceiver, it wrongly sets the flag that it's going to wait for that reply. That means that any future would-be sender of feedback messages ends up not sending a feedback message, because they all believe that a reply is expected. With built-in logical replication there's not much harm in this, because WalReceiverMain will send a ping-back every wal_receiver_timeout/2 anyway; but with other logical replication systems (e.g. pglogical) it can cause significant pain. 
This problem was introduced in commit 41d5f8ad734, where the request-reply flag was changed from true to false to WalSndKeepalive, without at the same time removing the line that sets waiting_for_ping_response. Just removing that line would be a sufficient fix, but it seems better to shift the responsibility of setting the flag to WalSndKeepalive itself instead of requiring caller to do it; this is clearly less error-prone. Author: Álvaro Herrera Reported-by: Ashutosh Bapat Backpatch: 9.5 and up Discussion: https://postgr.es/m/20200806225558.GA22401@alvherre.pgsql --- src/backend/replication/walsender.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 5e2210dd7bdc..d13220c14008 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -151,7 +151,7 @@ static XLogRecPtr sendTimeLineValidUpto = InvalidXLogRecPtr; * How far have we sent WAL already? This is also advertised in * MyWalSnd->sentPtr. (Actually, this is the next WAL location to send.) */ -static XLogRecPtr sentPtr = 0; +static XLogRecPtr sentPtr = InvalidXLogRecPtr; /* Buffers for constructing outgoing messages and processing reply messages. */ static StringInfoData output_message; @@ -1451,10 +1451,7 @@ WalSndWaitForWal(XLogRecPtr loc) if (MyWalSnd->flush < sentPtr && MyWalSnd->write < sentPtr && !waiting_for_ping_response) - { WalSndKeepalive(false); - waiting_for_ping_response = true; - } /* check whether we're done */ if (loc <= RecentFlushPtr) @@ -2932,10 +2929,7 @@ WalSndDone(WalSndSendDataCallback send_data) proc_exit(0); } if (!waiting_for_ping_response) - { WalSndKeepalive(true); - waiting_for_ping_response = true; - } } /* @@ -3432,10 +3426,13 @@ pg_stat_get_wal_senders(PG_FUNCTION_ARGS) } /* - * This function is used to send a keepalive message to standby. 
- * If requestReply is set, sets a flag in the message requesting the standby - * to send a message back to us, for heartbeat purposes. - */ + * Send a keepalive message to standby. + * + * If requestReply is set, the message requests the other party to send + * a message back to us, for heartbeat purposes. We also set a flag to + * let nearby code know that we're waiting for that response, to avoid + * repeated requests. + */ static void WalSndKeepalive(bool requestReply) { @@ -3450,6 +3447,10 @@ WalSndKeepalive(bool requestReply) /* ... and send it wrapped in CopyData */ pq_putmessage_noblock('d', output_message.data, output_message.len); + + /* Set local flag */ + if (requestReply) + waiting_for_ping_response = true; } /* @@ -3480,7 +3481,6 @@ WalSndKeepaliveIfNecessary(void) if (last_processing >= ping_time) { WalSndKeepalive(true); - waiting_for_ping_response = true; /* Try to flush pending output to the client */ if (pq_flush_if_writable() != 0) From 39132b784aeaaacf5ddfb5c35b6e29a6926f4345 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Sat, 8 Aug 2020 11:12:01 -0700 Subject: [PATCH 272/334] Teach amcheck to verify sibling links in all cases. Teach contrib/amcheck's bt_index_check() function to check agreement between siblings links. The left sibling's right link should point to a right sibling page whose left link points back to the same original left sibling. This extends a check that bt_index_parent_check() always performed to bt_index_check(). This is the first time amcheck has been taught to perform buffer lock coupling, which we have explicitly avoided up until now. The sibling link check tends to catch a lot of real world index corruption with little overhead, so it seems worth accepting the complexity. Note that the new lock coupling logic would not work correctly on replica servers without the changes made by commits 0a7d771f and 9a9db08a (there could be false positives without those changes). 
Author: Andrey Borodin, Peter Geoghegan Discussion: https://postgr.es/m/0EB0CFA8-CBD8-4296-8049-A2C0F28FAE8C@yandex-team.ru --- contrib/amcheck/verify_nbtree.c | 173 +++++++++++++++++++++++++++----- 1 file changed, 150 insertions(+), 23 deletions(-) diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index b87a3cb4717c..635ece73b354 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -145,6 +145,9 @@ static void bt_check_every_level(Relation rel, Relation heaprel, bool rootdescend); static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level); +static void bt_recheck_sibling_links(BtreeCheckState *state, + BlockNumber btpo_prev_from_target, + BlockNumber leftcurrent); static void bt_target_page_check(BtreeCheckState *state); static BTScanInsert bt_right_page_check_scankey(BtreeCheckState *state); static void bt_child_check(BtreeCheckState *state, BTScanInsert targetkey, @@ -787,17 +790,9 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) */ } - /* - * readonly mode can only ever land on live pages and half-dead pages, - * so sibling pointers should always be in mutual agreement - */ - if (state->readonly && opaque->btpo_prev != leftcurrent) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg("left link/right link pair in index \"%s\" not in agreement", - RelationGetRelationName(state->rel)), - errdetail_internal("Block=%u left block=%u left link from block=%u.", - current, leftcurrent, opaque->btpo_prev))); + /* Sibling links should be in mutual agreement */ + if (opaque->btpo_prev != leftcurrent) + bt_recheck_sibling_links(state, opaque->btpo_prev, leftcurrent); /* Check level, which must be valid for non-ignorable page */ if (level.level != opaque->btpo.level) @@ -877,6 +872,140 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level) return nextleveldown; } +/* + * Raise an error when target page's left link does not point 
back to the + * previous target page, called leftcurrent here. The leftcurrent page's + * right link was followed to get to the current target page, and we expect + * mutual agreement among leftcurrent and the current target page. Make sure + * that this condition has definitely been violated in the !readonly case, + * where concurrent page splits are something that we need to deal with. + * + * Cross-page inconsistencies involving pages that don't agree about being + * siblings are known to be a particularly good indicator of corruption + * involving partial writes/lost updates. The bt_right_page_check_scankey + * check also provides a way of detecting cross-page inconsistencies for + * !readonly callers, but it can only detect sibling pages that have an + * out-of-order keyspace, which can't catch many of the problems that we + * expect to catch here. + * + * The classic example of the kind of inconsistency that we can only catch + * with this check (when in !readonly mode) involves three sibling pages that + * were affected by a faulty page split at some point in the past. The + * effects of the split are reflected in the original page and its new right + * sibling page, with a lack of any accompanying changes for the _original_ + * right sibling page. The original right sibling page's left link fails to + * point to the new right sibling page (its left link still points to the + * original page), even though the first phase of a page split is supposed to + * work as a single atomic action. This subtle inconsistency will probably + * only break backwards scans in practice. + * + * Note that this is the only place where amcheck will "couple" buffer locks + * (and only for !readonly callers). In general we prefer to avoid more + * thorough cross-page checks in !readonly mode, but it seems worth the + * complexity here. Also, the performance overhead of performing lock + * coupling here is negligible in practice. 
Control only reaches here with a + * non-corrupt index when there is a concurrent page split at the instant + * caller crossed over to target page from leftcurrent page. + */ +static void +bt_recheck_sibling_links(BtreeCheckState *state, + BlockNumber btpo_prev_from_target, + BlockNumber leftcurrent) +{ + if (!state->readonly) + { + Buffer lbuf; + Buffer newtargetbuf; + Page page; + BTPageOpaque opaque; + BlockNumber newtargetblock; + + /* Couple locks in the usual order for nbtree: Left to right */ + lbuf = ReadBufferExtended(state->rel, MAIN_FORKNUM, leftcurrent, + RBM_NORMAL, state->checkstrategy); + LockBuffer(lbuf, BT_READ); + _bt_checkpage(state->rel, lbuf); + page = BufferGetPage(lbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + if (P_ISDELETED(opaque)) + { + /* + * Cannot reason about concurrently deleted page -- the left link + * in the page to the right is expected to point to some other + * page to the left (not leftcurrent page). + * + * Note that we deliberately don't give up with a half-dead page. + */ + UnlockReleaseBuffer(lbuf); + return; + } + + newtargetblock = opaque->btpo_next; + /* Avoid self-deadlock when newtargetblock == leftcurrent */ + if (newtargetblock != leftcurrent) + { + newtargetbuf = ReadBufferExtended(state->rel, MAIN_FORKNUM, + newtargetblock, RBM_NORMAL, + state->checkstrategy); + LockBuffer(newtargetbuf, BT_READ); + _bt_checkpage(state->rel, newtargetbuf); + page = BufferGetPage(newtargetbuf); + opaque = (BTPageOpaque) PageGetSpecialPointer(page); + /* btpo_prev_from_target may have changed; update it */ + btpo_prev_from_target = opaque->btpo_prev; + } + else + { + /* + * leftcurrent right sibling points back to leftcurrent block. + * Index is corrupt. Easiest way to handle this is to pretend + * that we actually read from a distinct page that has an invalid + * block number in its btpo_prev. 
+ */ + newtargetbuf = InvalidBuffer; + btpo_prev_from_target = InvalidBlockNumber; + } + + /* + * No need to check P_ISDELETED here, since new target block cannot be + * marked deleted as long as we hold a lock on lbuf + */ + if (BufferIsValid(newtargetbuf)) + UnlockReleaseBuffer(newtargetbuf); + UnlockReleaseBuffer(lbuf); + + if (btpo_prev_from_target == leftcurrent) + { + /* Report split in left sibling, not target (or new target) */ + ereport(DEBUG1, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("harmless concurrent page split detected in index \"%s\"", + RelationGetRelationName(state->rel)), + errdetail_internal("Block=%u new right sibling=%u original right sibling=%u.", + leftcurrent, newtargetblock, + state->targetblock))); + return; + } + + /* + * Index is corrupt. Make sure that we report correct target page. + * + * This could have changed in cases where there was a concurrent page + * split, as well as index corruption (at least in theory). Note that + * btpo_prev_from_target was already updated above. + */ + state->targetblock = newtargetblock; + } + + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg("left link/right link pair in index \"%s\" not in agreement", + RelationGetRelationName(state->rel)), + errdetail_internal("Block=%u left block=%u left link from block=%u.", + state->targetblock, leftcurrent, + btpo_prev_from_target))); +} + /* * Function performs the following checks on target page, or pages ancillary to * target page: @@ -1965,18 +2094,14 @@ bt_child_check(BtreeCheckState *state, BTScanInsert targetkey, * downlink, which was concurrently physically removed in target/parent as * part of deletion's first phase.) * - * Note that while the cross-page-same-level last item check uses a trick - * that allows it to perform verification for !readonly callers, a similar - * trick seems difficult here. 
The trick that that other check uses is, - * in essence, to lock down race conditions to those that occur due to - * concurrent page deletion of the target; that's a race that can be - * reliably detected before actually reporting corruption. - * - * On the other hand, we'd need to lock down race conditions involving - * deletion of child's left page, for long enough to read the child page - * into memory (in other words, a scheme with concurrently held buffer - * locks on both child and left-of-child pages). That's unacceptable for - * amcheck functions on general principle, though. + * While we use various techniques elsewhere to perform cross-page + * verification for !readonly callers, a similar trick seems difficult + * here. The tricks used by bt_recheck_sibling_links and by + * bt_right_page_check_scankey both involve verification of a same-level, + * cross-sibling invariant. Cross-level invariants are far more squishy, + * though. The nbtree REDO routines do not actually couple buffer locks + * across levels during page splits, so making any cross-level check work + * reliably in !readonly mode may be impossible. */ Assert(state->readonly); @@ -2785,6 +2910,8 @@ invariant_l_nontarget_offset(BtreeCheckState *state, BTScanInsert key, * There is never an attempt to get a consistent view of multiple pages using * multiple concurrent buffer locks; in general, we only acquire a single pin * and buffer lock at a time, which is often all that the nbtree code requires. + * (Actually, bt_recheck_sibling_links couples buffer locks, which is the only + * exception to this general rule.) * * Operating on a copy of the page is useful because it prevents control * getting stuck in an uninterruptible state when an underlying operator class From 20e7e1fe316467720d8d062e1a1429f798fc31bf Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 8 Aug 2020 17:26:29 -0400 Subject: [PATCH 273/334] Remove <@ from contrib/intarray's GiST operator classes. 
Since commit efc77cf5f, an indexed query using <@ has required a full-index scan, so that it actually performs worse than a plain seqscan would do. As I noted at the time, we'd be better off to not treat <@ as being indexable by such indexes at all; and that's what this patch does. It would have been difficult to remove these opclass members without dropping the whole opclass before commit 9f9682783 fixed GiST opclass member dependency rules, but now it's quite simple, so let's do it. I left the existing support code in place for the time being, with comments noting it's now unreachable. At some point, perhaps we should remove that code in favor of throwing an error telling people to upgrade the extension version. Discussion: https://postgr.es/m/2176979.1596389859@sss.pgh.pa.us Discussion: https://postgr.es/m/458.1565114141@sss.pgh.pa.us --- contrib/intarray/Makefile | 3 ++- contrib/intarray/_int_gist.c | 6 ++++++ contrib/intarray/_intbig_gist.c | 6 ++++++ contrib/intarray/intarray--1.3--1.4.sql | 21 +++++++++++++++++++++ contrib/intarray/intarray.control | 2 +- doc/src/sgml/intarray.sgml | 5 +++-- 6 files changed, 39 insertions(+), 4 deletions(-) create mode 100644 contrib/intarray/intarray--1.3--1.4.sql diff --git a/contrib/intarray/Makefile b/contrib/intarray/Makefile index b68959ebd64d..01faa36b1073 100644 --- a/contrib/intarray/Makefile +++ b/contrib/intarray/Makefile @@ -12,7 +12,8 @@ OBJS = \ _intbig_gist.o EXTENSION = intarray -DATA = intarray--1.2--1.3.sql intarray--1.2.sql intarray--1.1--1.2.sql \ +DATA = intarray--1.3--1.4.sql intarray--1.2--1.3.sql \ + intarray--1.2.sql intarray--1.1--1.2.sql \ intarray--1.0--1.1.sql PGFILEDESC = "intarray - functions and operators for arrays of integers" diff --git a/contrib/intarray/_int_gist.c b/contrib/intarray/_int_gist.c index fb05b06af9eb..f1817a6cce3b 100644 --- a/contrib/intarray/_int_gist.c +++ b/contrib/intarray/_int_gist.c @@ -93,6 +93,12 @@ g_int_consistent(PG_FUNCTION_ARGS) break; case 
RTContainedByStrategyNumber: case RTOldContainedByStrategyNumber: + + /* + * This code is unreachable as of intarray 1.4, because the <@ + * operator has been removed from the opclass. We keep it for now + * to support older versions of the SQL definitions. + */ if (GIST_LEAF(entry)) retval = inner_int_contains(query, (ArrayType *) DatumGetPointer(entry->key)); diff --git a/contrib/intarray/_intbig_gist.c b/contrib/intarray/_intbig_gist.c index 67c44e99a9a7..18ecd8cda6b1 100644 --- a/contrib/intarray/_intbig_gist.c +++ b/contrib/intarray/_intbig_gist.c @@ -533,6 +533,12 @@ g_intbig_consistent(PG_FUNCTION_ARGS) break; case RTContainedByStrategyNumber: case RTOldContainedByStrategyNumber: + + /* + * This code is unreachable as of intarray 1.4, because the <@ + * operator has been removed from the opclass. We keep it for now + * to support older versions of the SQL definitions. + */ if (GIST_LEAF(entry)) { int i, diff --git a/contrib/intarray/intarray--1.3--1.4.sql b/contrib/intarray/intarray--1.3--1.4.sql new file mode 100644 index 000000000000..3fbebb541737 --- /dev/null +++ b/contrib/intarray/intarray--1.3--1.4.sql @@ -0,0 +1,21 @@ +/* contrib/intarray/intarray--1.3--1.4.sql */ + +-- complain if script is sourced in psql, rather than via ALTER EXTENSION +\echo Use "ALTER EXTENSION intarray UPDATE TO '1.4'" to load this file. \quit + +-- Remove <@ from the GiST opclasses, as it's not usefully indexable +-- due to mishandling of empty arrays. (It's OK in GIN.) + +ALTER OPERATOR FAMILY gist__int_ops USING gist +DROP OPERATOR 8 (_int4, _int4); + +ALTER OPERATOR FAMILY gist__intbig_ops USING gist +DROP OPERATOR 8 (_int4, _int4); + +-- Likewise for the old spelling ~. 
+ +ALTER OPERATOR FAMILY gist__int_ops USING gist +DROP OPERATOR 14 (_int4, _int4); + +ALTER OPERATOR FAMILY gist__intbig_ops USING gist +DROP OPERATOR 14 (_int4, _int4); diff --git a/contrib/intarray/intarray.control b/contrib/intarray/intarray.control index db7746b6c7a0..bbc837c5732e 100644 --- a/contrib/intarray/intarray.control +++ b/contrib/intarray/intarray.control @@ -1,6 +1,6 @@ # intarray extension comment = 'functions, operators, and index support for 1-D arrays of integers' -default_version = '1.3' +default_version = '1.4' module_pathname = '$libdir/_int' relocatable = true trusted = true diff --git a/doc/src/sgml/intarray.sgml b/doc/src/sgml/intarray.sgml index 9d2eb52eeb4f..c8db87e97df9 100644 --- a/doc/src/sgml/intarray.sgml +++ b/doc/src/sgml/intarray.sgml @@ -399,7 +399,7 @@ intarray provides index support for the - &&, @>, <@, + &&, @>, and @@ operators, as well as regular array equality. @@ -436,7 +436,8 @@ There is also a non-default GIN operator class - gin__int_ops supporting the same operators. + gin__int_ops, which supports these operators as well + as <@. From 1c164ef3d28dfab445a885a03e80cfd0d552f64a Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sun, 9 Aug 2020 11:32:31 -0400 Subject: [PATCH 274/334] Remove useless Assert. Testing that an unsigned variable is >= 0 is pretty pointless, as noted by Coverity and numerous buildfarm members. In passing, add comment about new uses of "volatile" --- Coverity doesn't much like that either, but it seems probably necessary. 
--- src/backend/replication/logical/reorderbuffer.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index 5b7afe6d9e9c..1975d629a6e2 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -1907,6 +1907,9 @@ ReorderBufferResetTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, * merge) and replay the changes in lsn order. * * If streaming is true then data will be sent using stream API. + * + * Note: "volatile" markers on some parameters are to avoid trouble with + * PG_TRY inside the function. */ static void ReorderBufferProcessTXN(ReorderBuffer *rb, ReorderBufferTXN *txn, @@ -2762,7 +2765,6 @@ ReorderBufferChangeMemoryUpdate(ReorderBuffer *rb, } Assert(txn->size <= rb->size); - Assert((txn->size >= 0) && (rb->size >= 0)); } /* From 1b9cde51246c7773eac119b84cc18095118735de Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sun, 9 Aug 2020 12:39:07 -0400 Subject: [PATCH 275/334] Check for fseeko() failure in pg_dump's _tarAddFile(). Coverity pointed out, not unreasonably, that we checked fseeko's result at every other call site but these. Failure to seek in the temp file (note this is NOT pg_dump's output file) seems quite unlikely, and even if it did happen the file length cross-check further down would probably detect the problem. Still, that's a poor excuse for not checking the result of a system call. --- src/bin/pg_dump/pg_backup_tar.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/bin/pg_dump/pg_backup_tar.c b/src/bin/pg_dump/pg_backup_tar.c index b4f594295927..c601ec07012a 100644 --- a/src/bin/pg_dump/pg_backup_tar.c +++ b/src/bin/pg_dump/pg_backup_tar.c @@ -1082,11 +1082,13 @@ _tarAddFile(ArchiveHandle *AH, TAR_MEMBER *th) /* * Find file len & go back to start. 
*/ - fseeko(tmp, 0, SEEK_END); + if (fseeko(tmp, 0, SEEK_END) != 0) + fatal("error during file seek: %m"); th->fileLen = ftello(tmp); if (th->fileLen < 0) fatal("could not determine seek position in archive file: %m"); - fseeko(tmp, 0, SEEK_SET); + if (fseeko(tmp, 0, SEEK_SET) != 0) + fatal("error during file seek: %m"); _tarWriteHeader(th); From d129c07499dbf0d5960115173515e3ce384c662a Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Sun, 9 Aug 2020 12:01:15 -0700 Subject: [PATCH 276/334] Correct nbtree page split lock coupling comment. There is no reason to distinguish between readers and writers here. --- src/backend/access/nbtree/nbtinsert.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index e3a44bc09e02..d36f7557c87c 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -1861,11 +1861,9 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, } /* - * We have to grab the right sibling (if any) and fix the prev pointer - * there. We are guaranteed that this is deadlock-free since no other - * writer will be holding a lock on that page and trying to move left, and - * all readers release locks on a page before trying to fetch its - * neighbors. + * We have to grab the original right sibling (if any) and update its prev + * link. We are guaranteed that this is deadlock-free, since we couple + * the locks in the standard order: left to right. */ if (!isrightmost) { From 7eeb1d9861b0a3f453f8b31c7648396cdd7f1e59 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 10 Aug 2020 10:44:42 -0400 Subject: [PATCH 277/334] Make contrib modules' installation scripts more secure. Hostile objects located within the installation-time search_path could capture references in an extension's installation or upgrade script. 
If the extension is being installed with superuser privileges, this opens the door to privilege escalation. While such hazards have existed all along, their urgency increases with the v13 "trusted extensions" feature, because that lets a non-superuser control the installation path for a superuser-privileged script. Therefore, make a number of changes to make such situations more secure: * Tweak the construction of the installation-time search_path to ensure that references to objects in pg_catalog can't be subverted; and explicitly add pg_temp to the end of the path to prevent attacks using temporary objects. * Disable check_function_bodies within installation/upgrade scripts, so that any security gaps in SQL-language or PL-language function bodies cannot create a risk of unwanted installation-time code execution. * Adjust lookup of type input/receive functions and join estimator functions to complain if there are multiple candidate functions. This prevents capture of references to functions whose signature is not the first one checked; and it's arguably more user-friendly anyway. * Modify various contrib upgrade scripts to ensure that catalog modification queries are executed with secure search paths. (These are in-place modifications with no extension version changes, since it is the update process itself that is at issue, not the end result.) Extensions that depend on other extensions cannot be made fully secure by these methods alone; therefore, revert the "trusted" marking that commit eb67623c9 applied to earthdistance and hstore_plperl, pending some better solution to that set of issues. Also add documentation around these issues, to help extension authors write secure installation scripts. Patch by me, following an observation by Andres Freund; thanks to Noah Misch for review. 
Security: CVE-2020-14350 --- contrib/btree_gist/btree_gist--1.1--1.2.sql | 56 +++-- contrib/citext/citext--1.1--1.2.sql | 26 ++- contrib/citext/citext--1.2--1.3.sql | 18 +- contrib/cube/cube--1.1--1.2.sql | 25 ++- contrib/cube/cube--1.3--1.4.sql | 25 ++- contrib/earthdistance/earthdistance--1.1.sql | 2 +- contrib/earthdistance/earthdistance.control | 1 - contrib/hstore/hstore--1.1--1.2.sql | 9 +- contrib/hstore/hstore--1.3--1.4.sql | 35 +++- contrib/hstore_plperl/hstore_plperl.control | 1 - contrib/intagg/intagg--1.0--1.1.sql | 14 +- contrib/intarray/intarray--1.1--1.2.sql | 27 ++- contrib/ltree/ltree--1.0--1.1.sql | 37 +++- contrib/pg_trgm/pg_trgm--1.2--1.3.sql | 25 ++- contrib/seg/seg--1.0--1.1.sql | 23 ++- contrib/seg/seg--1.2--1.3.sql | 25 ++- doc/src/sgml/earthdistance.sgml | 27 ++- doc/src/sgml/extend.sgml | 203 +++++++++++++++---- doc/src/sgml/hstore.sgml | 12 +- doc/src/sgml/ltree.sgml | 9 + doc/src/sgml/ref/create_extension.sgml | 37 +++- src/backend/commands/extension.c | 21 +- src/backend/commands/operatorcmds.c | 26 ++- src/backend/commands/typecmds.c | 50 +++-- 24 files changed, 575 insertions(+), 159 deletions(-) diff --git a/contrib/btree_gist/btree_gist--1.1--1.2.sql b/contrib/btree_gist/btree_gist--1.1--1.2.sql index 8487f9bfc88a..d5a8c6cf90e9 100644 --- a/contrib/btree_gist/btree_gist--1.1--1.2.sql +++ b/contrib/btree_gist/btree_gist--1.1--1.2.sql @@ -8,56 +8,72 @@ -- wherein the signatures have been updated already. In that case to_regprocedure() will -- return NULL and no updates will happen. 
+DO LANGUAGE plpgsql +$$ +DECLARE + my_schema pg_catalog.text := pg_catalog.quote_ident(pg_catalog.current_schema()); + old_path pg_catalog.text := pg_catalog.current_setting('search_path'); +BEGIN +-- for safety, transiently set search_path to just pg_catalog+pg_temp +PERFORM pg_catalog.set_config('search_path', 'pg_catalog, pg_temp', true); + UPDATE pg_catalog.pg_proc SET proargtypes = pg_catalog.array_to_string(newtypes::pg_catalog.oid[], ' ')::pg_catalog.oidvector, pronargs = pg_catalog.array_length(newtypes, 1) FROM (VALUES -(NULL::pg_catalog.text, NULL::pg_catalog.regtype[]), -- establish column types +(NULL::pg_catalog.text, NULL::pg_catalog.text[]), -- establish column types ('gbt_oid_distance(internal,oid,int2,oid)', '{internal,oid,int2,oid,internal}'), ('gbt_oid_union(bytea,internal)', '{internal,internal}'), -('gbt_oid_same(internal,internal,internal)', '{gbtreekey8,gbtreekey8,internal}'), +('gbt_oid_same(internal,internal,internal)', '{SCH.gbtreekey8,SCH.gbtreekey8,internal}'), ('gbt_int2_distance(internal,int2,int2,oid)', '{internal,int2,int2,oid,internal}'), ('gbt_int2_union(bytea,internal)', '{internal,internal}'), -('gbt_int2_same(internal,internal,internal)', '{gbtreekey4,gbtreekey4,internal}'), +('gbt_int2_same(internal,internal,internal)', '{SCH.gbtreekey4,SCH.gbtreekey4,internal}'), ('gbt_int4_distance(internal,int4,int2,oid)', '{internal,int4,int2,oid,internal}'), ('gbt_int4_union(bytea,internal)', '{internal,internal}'), -('gbt_int4_same(internal,internal,internal)', '{gbtreekey8,gbtreekey8,internal}'), +('gbt_int4_same(internal,internal,internal)', '{SCH.gbtreekey8,SCH.gbtreekey8,internal}'), ('gbt_int8_distance(internal,int8,int2,oid)', '{internal,int8,int2,oid,internal}'), ('gbt_int8_union(bytea,internal)', '{internal,internal}'), -('gbt_int8_same(internal,internal,internal)', '{gbtreekey16,gbtreekey16,internal}'), +('gbt_int8_same(internal,internal,internal)', '{SCH.gbtreekey16,SCH.gbtreekey16,internal}'), 
('gbt_float4_distance(internal,float4,int2,oid)', '{internal,float4,int2,oid,internal}'), ('gbt_float4_union(bytea,internal)', '{internal,internal}'), -('gbt_float4_same(internal,internal,internal)', '{gbtreekey8,gbtreekey8,internal}'), +('gbt_float4_same(internal,internal,internal)', '{SCH.gbtreekey8,SCH.gbtreekey8,internal}'), ('gbt_float8_distance(internal,float8,int2,oid)', '{internal,float8,int2,oid,internal}'), ('gbt_float8_union(bytea,internal)', '{internal,internal}'), -('gbt_float8_same(internal,internal,internal)', '{gbtreekey16,gbtreekey16,internal}'), +('gbt_float8_same(internal,internal,internal)', '{SCH.gbtreekey16,SCH.gbtreekey16,internal}'), ('gbt_ts_distance(internal,timestamp,int2,oid)', '{internal,timestamp,int2,oid,internal}'), ('gbt_tstz_distance(internal,timestamptz,int2,oid)', '{internal,timestamptz,int2,oid,internal}'), ('gbt_ts_union(bytea,internal)', '{internal,internal}'), -('gbt_ts_same(internal,internal,internal)', '{gbtreekey16,gbtreekey16,internal}'), +('gbt_ts_same(internal,internal,internal)', '{SCH.gbtreekey16,SCH.gbtreekey16,internal}'), ('gbt_time_distance(internal,time,int2,oid)', '{internal,time,int2,oid,internal}'), ('gbt_time_union(bytea,internal)', '{internal,internal}'), -('gbt_time_same(internal,internal,internal)', '{gbtreekey16,gbtreekey16,internal}'), +('gbt_time_same(internal,internal,internal)', '{SCH.gbtreekey16,SCH.gbtreekey16,internal}'), ('gbt_date_distance(internal,date,int2,oid)', '{internal,date,int2,oid,internal}'), ('gbt_date_union(bytea,internal)', '{internal,internal}'), -('gbt_date_same(internal,internal,internal)', '{gbtreekey8,gbtreekey8,internal}'), +('gbt_date_same(internal,internal,internal)', '{SCH.gbtreekey8,SCH.gbtreekey8,internal}'), ('gbt_intv_distance(internal,interval,int2,oid)', '{internal,interval,int2,oid,internal}'), ('gbt_intv_union(bytea,internal)', '{internal,internal}'), -('gbt_intv_same(internal,internal,internal)', '{gbtreekey32,gbtreekey32,internal}'), 
+('gbt_intv_same(internal,internal,internal)', '{SCH.gbtreekey32,SCH.gbtreekey32,internal}'), ('gbt_cash_distance(internal,money,int2,oid)', '{internal,money,int2,oid,internal}'), ('gbt_cash_union(bytea,internal)', '{internal,internal}'), -('gbt_cash_same(internal,internal,internal)', '{gbtreekey16,gbtreekey16,internal}'), +('gbt_cash_same(internal,internal,internal)', '{SCH.gbtreekey16,SCH.gbtreekey16,internal}'), ('gbt_macad_union(bytea,internal)', '{internal,internal}'), -('gbt_macad_same(internal,internal,internal)', '{gbtreekey16,gbtreekey16,internal}'), +('gbt_macad_same(internal,internal,internal)', '{SCH.gbtreekey16,SCH.gbtreekey16,internal}'), ('gbt_text_union(bytea,internal)', '{internal,internal}'), -('gbt_text_same(internal,internal,internal)', '{gbtreekey_var,gbtreekey_var,internal}'), +('gbt_text_same(internal,internal,internal)', '{SCH.gbtreekey_var,SCH.gbtreekey_var,internal}'), ('gbt_bytea_union(bytea,internal)', '{internal,internal}'), -('gbt_bytea_same(internal,internal,internal)', '{gbtreekey_var,gbtreekey_var,internal}'), +('gbt_bytea_same(internal,internal,internal)', '{SCH.gbtreekey_var,SCH.gbtreekey_var,internal}'), ('gbt_numeric_union(bytea,internal)', '{internal,internal}'), -('gbt_numeric_same(internal,internal,internal)', '{gbtreekey_var,gbtreekey_var,internal}'), +('gbt_numeric_same(internal,internal,internal)', '{SCH.gbtreekey_var,SCH.gbtreekey_var,internal}'), ('gbt_bit_union(bytea,internal)', '{internal,internal}'), -('gbt_bit_same(internal,internal,internal)', '{gbtreekey_var,gbtreekey_var,internal}'), +('gbt_bit_same(internal,internal,internal)', '{SCH.gbtreekey_var,SCH.gbtreekey_var,internal}'), ('gbt_inet_union(bytea,internal)', '{internal,internal}'), -('gbt_inet_same(internal,internal,internal)', '{gbtreekey16,gbtreekey16,internal}') -) AS update_data (oldproc, newtypes) -WHERE oid = pg_catalog.to_regprocedure(oldproc); +('gbt_inet_same(internal,internal,internal)', '{SCH.gbtreekey16,SCH.gbtreekey16,internal}') +) AS 
update_data (oldproc, newtypestext), +LATERAL ( + SELECT array_agg(replace(typ, 'SCH', my_schema)::regtype) as newtypes FROM unnest(newtypestext) typ +) ls +WHERE oid = to_regprocedure(my_schema || '.' || replace(oldproc, 'SCH', my_schema)); + +PERFORM pg_catalog.set_config('search_path', old_path, true); +END +$$; diff --git a/contrib/citext/citext--1.1--1.2.sql b/contrib/citext/citext--1.1--1.2.sql index 4f0e4bc7195b..a8bba860a1d4 100644 --- a/contrib/citext/citext--1.1--1.2.sql +++ b/contrib/citext/citext--1.1--1.2.sql @@ -41,14 +41,28 @@ ALTER FUNCTION replace(citext, citext, citext) PARALLEL SAFE; ALTER FUNCTION split_part(citext, citext, int) PARALLEL SAFE; ALTER FUNCTION translate(citext, citext, text) PARALLEL SAFE; +-- We have to update aggregates the hard way for lack of ALTER support +DO LANGUAGE plpgsql +$$ +DECLARE + my_schema pg_catalog.text := pg_catalog.quote_ident(pg_catalog.current_schema()); + old_path pg_catalog.text := pg_catalog.current_setting('search_path'); +BEGIN +-- for safety, transiently set search_path to just pg_catalog+pg_temp +PERFORM pg_catalog.set_config('search_path', 'pg_catalog, pg_temp', true); + UPDATE pg_proc SET proparallel = 's' -WHERE oid = 'min(citext)'::pg_catalog.regprocedure; +WHERE oid = (my_schema || '.min(' || my_schema || '.citext)')::pg_catalog.regprocedure; UPDATE pg_proc SET proparallel = 's' -WHERE oid = 'max(citext)'::pg_catalog.regprocedure; +WHERE oid = (my_schema || '.max(' || my_schema || '.citext)')::pg_catalog.regprocedure; + +UPDATE pg_aggregate SET aggcombinefn = (my_schema || '.citext_smaller')::regproc +WHERE aggfnoid = (my_schema || '.max(' || my_schema || '.citext)')::pg_catalog.regprocedure; -UPDATE pg_aggregate SET aggcombinefn = 'citext_smaller' -WHERE aggfnoid = 'max(citext)'::pg_catalog.regprocedure; +UPDATE pg_aggregate SET aggcombinefn = (my_schema || '.citext_larger')::regproc +WHERE aggfnoid = (my_schema || '.max(' || my_schema || '.citext)')::pg_catalog.regprocedure; -UPDATE pg_aggregate 
SET aggcombinefn = 'citext_larger' -WHERE aggfnoid = 'max(citext)'::pg_catalog.regprocedure; +PERFORM pg_catalog.set_config('search_path', old_path, true); +END +$$; diff --git a/contrib/citext/citext--1.2--1.3.sql b/contrib/citext/citext--1.2--1.3.sql index 4ab867915c73..24a71452c624 100644 --- a/contrib/citext/citext--1.2--1.3.sql +++ b/contrib/citext/citext--1.2--1.3.sql @@ -3,5 +3,19 @@ -- complain if script is sourced in psql, rather than via ALTER EXTENSION \echo Use "ALTER EXTENSION citext UPDATE TO '1.3'" to load this file. \quit -UPDATE pg_aggregate SET aggcombinefn = 'citext_smaller' -WHERE aggfnoid = 'min(citext)'::pg_catalog.regprocedure; +-- We have to update aggregates the hard way for lack of ALTER support +DO LANGUAGE plpgsql +$$ +DECLARE + my_schema pg_catalog.text := pg_catalog.quote_ident(pg_catalog.current_schema()); + old_path pg_catalog.text := pg_catalog.current_setting('search_path'); +BEGIN +-- for safety, transiently set search_path to just pg_catalog+pg_temp +PERFORM pg_catalog.set_config('search_path', 'pg_catalog, pg_temp', true); + +UPDATE pg_aggregate SET aggcombinefn = (my_schema || '.citext_smaller')::regproc +WHERE aggfnoid = (my_schema || '.min(' || my_schema || '.citext)')::pg_catalog.regprocedure; + +PERFORM pg_catalog.set_config('search_path', old_path, true); +END +$$; diff --git a/contrib/cube/cube--1.1--1.2.sql b/contrib/cube/cube--1.1--1.2.sql index 64a531e8b433..76aba239e5bc 100644 --- a/contrib/cube/cube--1.1--1.2.sql +++ b/contrib/cube/cube--1.1--1.2.sql @@ -7,16 +7,31 @@ -- We use to_regprocedure() so that query doesn't fail if run against 9.6beta1 definitions, -- wherein the signatures have been updated already. In that case to_regprocedure() will -- return NULL and no updates will happen. 
+DO LANGUAGE plpgsql +$$ +DECLARE + my_schema pg_catalog.text := pg_catalog.quote_ident(pg_catalog.current_schema()); + old_path pg_catalog.text := pg_catalog.current_setting('search_path'); +BEGIN +-- for safety, transiently set search_path to just pg_catalog+pg_temp +PERFORM pg_catalog.set_config('search_path', 'pg_catalog, pg_temp', true); UPDATE pg_catalog.pg_proc SET proargtypes = pg_catalog.array_to_string(newtypes::pg_catalog.oid[], ' ')::pg_catalog.oidvector, pronargs = pg_catalog.array_length(newtypes, 1) FROM (VALUES -(NULL::pg_catalog.text, NULL::pg_catalog.regtype[]), -- establish column types -('g_cube_consistent(internal,cube,int4,oid,internal)', '{internal,cube,int2,oid,internal}'), -('g_cube_distance(internal,cube,smallint,oid)', '{internal,cube,smallint,oid,internal}') -) AS update_data (oldproc, newtypes) -WHERE oid = pg_catalog.to_regprocedure(oldproc); +(NULL::pg_catalog.text, NULL::pg_catalog.text[]), -- establish column types +('g_cube_consistent(internal,SCH.cube,int4,oid,internal)', '{internal,SCH.cube,int2,oid,internal}'), +('g_cube_distance(internal,SCH.cube,smallint,oid)', '{internal,SCH.cube,smallint,oid,internal}') +) AS update_data (oldproc, newtypestext), +LATERAL ( + SELECT array_agg(replace(typ, 'SCH', my_schema)::regtype) as newtypes FROM unnest(newtypestext) typ +) ls +WHERE oid = to_regprocedure(my_schema || '.' || replace(oldproc, 'SCH', my_schema)); + +PERFORM pg_catalog.set_config('search_path', old_path, true); +END +$$; ALTER FUNCTION cube_in(cstring) PARALLEL SAFE; ALTER FUNCTION cube(float8[], float8[]) PARALLEL SAFE; diff --git a/contrib/cube/cube--1.3--1.4.sql b/contrib/cube/cube--1.3--1.4.sql index 869820c0c834..41629395df27 100644 --- a/contrib/cube/cube--1.3--1.4.sql +++ b/contrib/cube/cube--1.3--1.4.sql @@ -12,6 +12,15 @@ -- bound into a particular opclass. There's no SQL command for that, -- so fake it with a manual update on pg_depend. 
-- +DO LANGUAGE plpgsql +$$ +DECLARE + my_schema pg_catalog.text := pg_catalog.quote_ident(pg_catalog.current_schema()); + old_path pg_catalog.text := pg_catalog.current_setting('search_path'); +BEGIN +-- for safety, transiently set search_path to just pg_catalog+pg_temp +PERFORM pg_catalog.set_config('search_path', 'pg_catalog, pg_temp', true); + UPDATE pg_catalog.pg_depend SET deptype = 'a' WHERE classid = 'pg_catalog.pg_amproc'::pg_catalog.regclass @@ -20,14 +29,10 @@ WHERE classid = 'pg_catalog.pg_amproc'::pg_catalog.regclass FROM pg_catalog.pg_depend WHERE classid = 'pg_catalog.pg_amproc'::pg_catalog.regclass AND refclassid = 'pg_catalog.pg_proc'::pg_catalog.regclass - AND (refobjid = 'g_cube_compress(pg_catalog.internal)'::pg_catalog.regprocedure)) + AND (refobjid = (my_schema || '.g_cube_compress(pg_catalog.internal)')::pg_catalog.regprocedure)) AND refclassid = 'pg_catalog.pg_opclass'::pg_catalog.regclass AND deptype = 'i'; -ALTER OPERATOR FAMILY gist_cube_ops USING gist drop function 3 (cube); -ALTER EXTENSION cube DROP function g_cube_compress(pg_catalog.internal); -DROP FUNCTION g_cube_compress(pg_catalog.internal); - UPDATE pg_catalog.pg_depend SET deptype = 'a' WHERE classid = 'pg_catalog.pg_amproc'::pg_catalog.regclass @@ -36,10 +41,18 @@ WHERE classid = 'pg_catalog.pg_amproc'::pg_catalog.regclass FROM pg_catalog.pg_depend WHERE classid = 'pg_catalog.pg_amproc'::pg_catalog.regclass AND refclassid = 'pg_catalog.pg_proc'::pg_catalog.regclass - AND (refobjid = 'g_cube_decompress(pg_catalog.internal)'::pg_catalog.regprocedure)) + AND (refobjid = (my_schema || '.g_cube_decompress(pg_catalog.internal)')::pg_catalog.regprocedure)) AND refclassid = 'pg_catalog.pg_opclass'::pg_catalog.regclass AND deptype = 'i'; +PERFORM pg_catalog.set_config('search_path', old_path, true); +END +$$; + +ALTER OPERATOR FAMILY gist_cube_ops USING gist drop function 3 (cube); +ALTER EXTENSION cube DROP function g_cube_compress(pg_catalog.internal); +DROP FUNCTION 
g_cube_compress(pg_catalog.internal); + ALTER OPERATOR FAMILY gist_cube_ops USING gist drop function 4 (cube); ALTER EXTENSION cube DROP function g_cube_decompress(pg_catalog.internal); DROP FUNCTION g_cube_decompress(pg_catalog.internal); diff --git a/contrib/earthdistance/earthdistance--1.1.sql b/contrib/earthdistance/earthdistance--1.1.sql index 9136a54a7b34..9ef20ab848c5 100644 --- a/contrib/earthdistance/earthdistance--1.1.sql +++ b/contrib/earthdistance/earthdistance--1.1.sql @@ -31,7 +31,7 @@ CREATE DOMAIN earth AS cube CONSTRAINT not_point check(cube_is_point(value)) CONSTRAINT not_3d check(cube_dim(value) <= 3) CONSTRAINT on_surface check(abs(cube_distance(value, '(0)'::cube) / - earth() - 1) < '10e-7'::float8); + earth() - '1'::float8) < '10e-7'::float8); CREATE FUNCTION sec_to_gc(float8) RETURNS float8 diff --git a/contrib/earthdistance/earthdistance.control b/contrib/earthdistance/earthdistance.control index 3df666dfc1bb..5816d22cdd98 100644 --- a/contrib/earthdistance/earthdistance.control +++ b/contrib/earthdistance/earthdistance.control @@ -3,5 +3,4 @@ comment = 'calculate great-circle distances on the surface of the Earth' default_version = '1.1' module_pathname = '$libdir/earthdistance' relocatable = true -trusted = true requires = 'cube' diff --git a/contrib/hstore/hstore--1.1--1.2.sql b/contrib/hstore/hstore--1.1--1.2.sql index a868ffe48e1a..cc69fc7f802e 100644 --- a/contrib/hstore/hstore--1.1--1.2.sql +++ b/contrib/hstore/hstore--1.1--1.2.sql @@ -9,10 +9,13 @@ -- dependent on the extension. 
DO LANGUAGE plpgsql - $$ - +DECLARE + my_schema pg_catalog.text := pg_catalog.quote_ident(pg_catalog.current_schema()); + old_path pg_catalog.text := pg_catalog.current_setting('search_path'); BEGIN +-- for safety, transiently set search_path to just pg_catalog+pg_temp +PERFORM pg_catalog.set_config('search_path', 'pg_catalog, pg_temp', true); PERFORM 1 FROM pg_proc p @@ -27,6 +30,7 @@ BEGIN IF NOT FOUND THEN + PERFORM pg_catalog.set_config('search_path', old_path, true); CREATE FUNCTION hstore_to_json(hstore) RETURNS json @@ -43,6 +47,7 @@ BEGIN END IF; +PERFORM pg_catalog.set_config('search_path', old_path, true); END; $$; diff --git a/contrib/hstore/hstore--1.3--1.4.sql b/contrib/hstore/hstore--1.3--1.4.sql index d68956bb9495..53f26f9fb847 100644 --- a/contrib/hstore/hstore--1.3--1.4.sql +++ b/contrib/hstore/hstore--1.3--1.4.sql @@ -7,23 +7,38 @@ -- We use to_regprocedure() so that query doesn't fail if run against 9.6beta1 definitions, -- wherein the signatures have been updated already. In that case to_regprocedure() will -- return NULL and no updates will happen. 
+DO LANGUAGE plpgsql +$$ +DECLARE + my_schema pg_catalog.text := pg_catalog.quote_ident(pg_catalog.current_schema()); + old_path pg_catalog.text := pg_catalog.current_setting('search_path'); +BEGIN +-- for safety, transiently set search_path to just pg_catalog+pg_temp +PERFORM pg_catalog.set_config('search_path', 'pg_catalog, pg_temp', true); UPDATE pg_catalog.pg_proc SET proargtypes = pg_catalog.array_to_string(newtypes::pg_catalog.oid[], ' ')::pg_catalog.oidvector, pronargs = pg_catalog.array_length(newtypes, 1) FROM (VALUES -(NULL::pg_catalog.text, NULL::pg_catalog.regtype[]), -- establish column types -('ghstore_same(internal,internal,internal)', '{ghstore,ghstore,internal}'), -('ghstore_consistent(internal,internal,int4,oid,internal)', '{internal,hstore,int2,oid,internal}'), -('gin_extract_hstore(internal,internal)', '{hstore,internal}'), -('gin_extract_hstore_query(internal,internal,int2,internal,internal)', '{hstore,internal,int2,internal,internal}'), -('gin_consistent_hstore(internal,int2,internal,int4,internal,internal)', '{internal,int2,hstore,int4,internal,internal}') -) AS update_data (oldproc, newtypes) -WHERE oid = pg_catalog.to_regprocedure(oldproc); +(NULL::pg_catalog.text, NULL::pg_catalog.text[]), -- establish column types +('ghstore_same(internal,internal,internal)', '{SCH.ghstore,SCH.ghstore,internal}'), +('ghstore_consistent(internal,internal,int4,oid,internal)', '{internal,SCH.hstore,int2,oid,internal}'), +('gin_extract_hstore(internal,internal)', '{SCH.hstore,internal}'), +('gin_extract_hstore_query(internal,internal,int2,internal,internal)', '{SCH.hstore,internal,int2,internal,internal}'), +('gin_consistent_hstore(internal,int2,internal,int4,internal,internal)', '{internal,int2,SCH.hstore,int4,internal,internal}') +) AS update_data (oldproc, newtypestext), +LATERAL ( + SELECT array_agg(replace(typ, 'SCH', my_schema)::regtype) as newtypes FROM unnest(newtypestext) typ +) ls +WHERE oid = to_regprocedure(my_schema || '.' 
|| replace(oldproc, 'SCH', my_schema)); UPDATE pg_catalog.pg_proc SET - prorettype = 'ghstore'::pg_catalog.regtype -WHERE oid = pg_catalog.to_regprocedure('ghstore_union(internal,internal)'); + prorettype = (my_schema || '.ghstore')::pg_catalog.regtype +WHERE oid = pg_catalog.to_regprocedure((my_schema || '.ghstore_union(internal,internal)')); + +PERFORM pg_catalog.set_config('search_path', old_path, true); +END +$$; ALTER FUNCTION hstore_in(cstring) PARALLEL SAFE; ALTER FUNCTION hstore_out(hstore) PARALLEL SAFE; diff --git a/contrib/hstore_plperl/hstore_plperl.control b/contrib/hstore_plperl/hstore_plperl.control index 4b9fd13d04fc..16277f68c1cc 100644 --- a/contrib/hstore_plperl/hstore_plperl.control +++ b/contrib/hstore_plperl/hstore_plperl.control @@ -3,5 +3,4 @@ comment = 'transform between hstore and plperl' default_version = '1.0' module_pathname = '$libdir/hstore_plperl' relocatable = true -trusted = true requires = 'hstore,plperl' diff --git a/contrib/intagg/intagg--1.0--1.1.sql b/contrib/intagg/intagg--1.0--1.1.sql index b2a2820b0cac..c0cc17a033bd 100644 --- a/contrib/intagg/intagg--1.0--1.1.sql +++ b/contrib/intagg/intagg--1.0--1.1.sql @@ -6,6 +6,18 @@ ALTER FUNCTION int_agg_state(internal, int4) PARALLEL SAFE; ALTER FUNCTION int_agg_final_array(internal) PARALLEL SAFE; ALTER FUNCTION int_array_enum(int4[]) PARALLEL SAFE; +DO LANGUAGE plpgsql +$$ +DECLARE + my_schema pg_catalog.text := pg_catalog.quote_ident(pg_catalog.current_schema()); + old_path pg_catalog.text := pg_catalog.current_setting('search_path'); +BEGIN +-- for safety, transiently set search_path to just pg_catalog+pg_temp +PERFORM pg_catalog.set_config('search_path', 'pg_catalog, pg_temp', true); UPDATE pg_proc SET proparallel = 's' -WHERE oid = 'int_array_aggregate(int4)'::pg_catalog.regprocedure; +WHERE oid = (my_schema || '.int_array_aggregate(int4)')::pg_catalog.regprocedure; + +PERFORM pg_catalog.set_config('search_path', old_path, true); +END +$$; diff --git 
a/contrib/intarray/intarray--1.1--1.2.sql b/contrib/intarray/intarray--1.1--1.2.sql index 468f245ecec9..919340ef01ef 100644 --- a/contrib/intarray/intarray--1.1--1.2.sql +++ b/contrib/intarray/intarray--1.1--1.2.sql @@ -7,23 +7,38 @@ -- We use to_regprocedure() so that query doesn't fail if run against 9.6beta1 definitions, -- wherein the signatures have been updated already. In that case to_regprocedure() will -- return NULL and no updates will happen. +DO LANGUAGE plpgsql +$$ +DECLARE + my_schema pg_catalog.text := pg_catalog.quote_ident(pg_catalog.current_schema()); + old_path pg_catalog.text := pg_catalog.current_setting('search_path'); +BEGIN +-- for safety, transiently set search_path to just pg_catalog+pg_temp +PERFORM pg_catalog.set_config('search_path', 'pg_catalog, pg_temp', true); UPDATE pg_catalog.pg_proc SET proargtypes = pg_catalog.array_to_string(newtypes::pg_catalog.oid[], ' ')::pg_catalog.oidvector, pronargs = pg_catalog.array_length(newtypes, 1) FROM (VALUES -(NULL::pg_catalog.text, NULL::pg_catalog.regtype[]), -- establish column types +(NULL::pg_catalog.text, NULL::pg_catalog.text[]), -- establish column types ('g_int_consistent(internal,_int4,int4,oid,internal)', '{internal,_int4,int2,oid,internal}'), ('g_intbig_consistent(internal,internal,int4,oid,internal)', '{internal,_int4,int2,oid,internal}'), -('g_intbig_same(internal,internal,internal)', '{intbig_gkey,intbig_gkey,internal}'), +('g_intbig_same(internal,internal,internal)', '{SCH.intbig_gkey,SCH.intbig_gkey,internal}'), ('ginint4_queryextract(internal,internal,int2,internal,internal,internal,internal)', '{_int4,internal,int2,internal,internal,internal,internal}'), ('ginint4_consistent(internal,int2,internal,int4,internal,internal,internal,internal)', '{internal,int2,_int4,int4,internal,internal,internal,internal}') -) AS update_data (oldproc, newtypes) -WHERE oid = pg_catalog.to_regprocedure(oldproc); +) AS update_data (oldproc, newtypestext), +LATERAL ( + SELECT array_agg(replace(typ, 
'SCH', my_schema)::regtype) as newtypes FROM unnest(newtypestext) typ +) ls +WHERE oid = to_regprocedure(my_schema || '.' || replace(oldproc, 'SCH', my_schema)); UPDATE pg_catalog.pg_proc SET - prorettype = 'intbig_gkey'::pg_catalog.regtype -WHERE oid = pg_catalog.to_regprocedure('g_intbig_union(internal,internal)'); + prorettype = (my_schema || '.intbig_gkey')::pg_catalog.regtype +WHERE oid = pg_catalog.to_regprocedure(my_schema || '.g_intbig_union(internal,internal)'); + +PERFORM pg_catalog.set_config('search_path', old_path, true); +END +$$; ALTER FUNCTION bqarr_in(cstring) PARALLEL SAFE; ALTER FUNCTION bqarr_out(query_int) PARALLEL SAFE; diff --git a/contrib/ltree/ltree--1.0--1.1.sql b/contrib/ltree/ltree--1.0--1.1.sql index 155751aa3a87..2ce6f5adbc21 100644 --- a/contrib/ltree/ltree--1.0--1.1.sql +++ b/contrib/ltree/ltree--1.0--1.1.sql @@ -7,26 +7,41 @@ -- We use to_regprocedure() so that query doesn't fail if run against 9.6beta1 definitions, -- wherein the signatures have been updated already. In that case to_regprocedure() will -- return NULL and no updates will happen. 
+DO LANGUAGE plpgsql +$$ +DECLARE + my_schema pg_catalog.text := pg_catalog.quote_ident(pg_catalog.current_schema()); + old_path pg_catalog.text := pg_catalog.current_setting('search_path'); +BEGIN +-- for safety, transiently set search_path to just pg_catalog+pg_temp +PERFORM pg_catalog.set_config('search_path', 'pg_catalog, pg_temp', true); UPDATE pg_catalog.pg_proc SET proargtypes = pg_catalog.array_to_string(newtypes::pg_catalog.oid[], ' ')::pg_catalog.oidvector, pronargs = pg_catalog.array_length(newtypes, 1) FROM (VALUES -(NULL::pg_catalog.text, NULL::pg_catalog.regtype[]), -- establish column types -('ltree_consistent(internal,internal,int2,oid,internal)', '{internal,ltree,int2,oid,internal}'), -('ltree_same(internal,internal,internal)', '{ltree_gist,ltree_gist,internal}'), -('_ltree_consistent(internal,internal,int2,oid,internal)', '{internal,_ltree,int2,oid,internal}'), -('_ltree_same(internal,internal,internal)', '{ltree_gist,ltree_gist,internal}') -) AS update_data (oldproc, newtypes) -WHERE oid = pg_catalog.to_regprocedure(oldproc); +(NULL::pg_catalog.text, NULL::pg_catalog.text[]), -- establish column types +('ltree_consistent(internal,internal,int2,oid,internal)', '{internal,SCH.ltree,int2,oid,internal}'), +('ltree_same(internal,internal,internal)', '{SCH.ltree_gist,SCH.ltree_gist,internal}'), +('_ltree_consistent(internal,internal,int2,oid,internal)', '{internal,SCH._ltree,int2,oid,internal}'), +('_ltree_same(internal,internal,internal)', '{SCH.ltree_gist,SCH.ltree_gist,internal}') +) AS update_data (oldproc, newtypestext), +LATERAL ( + SELECT array_agg(replace(typ, 'SCH', my_schema)::regtype) as newtypes FROM unnest(newtypestext) typ +) ls +WHERE oid = to_regprocedure(my_schema || '.' 
|| replace(oldproc, 'SCH', my_schema)); UPDATE pg_catalog.pg_proc SET - prorettype = 'ltree_gist'::pg_catalog.regtype -WHERE oid = pg_catalog.to_regprocedure('ltree_union(internal,internal)'); + prorettype = (my_schema || '.ltree_gist')::pg_catalog.regtype +WHERE oid = pg_catalog.to_regprocedure(my_schema || '.ltree_union(internal,internal)'); UPDATE pg_catalog.pg_proc SET - prorettype = 'ltree_gist'::pg_catalog.regtype -WHERE oid = pg_catalog.to_regprocedure('_ltree_union(internal,internal)'); + prorettype = (my_schema || '.ltree_gist')::pg_catalog.regtype +WHERE oid = pg_catalog.to_regprocedure(my_schema || '._ltree_union(internal,internal)'); + +PERFORM pg_catalog.set_config('search_path', old_path, true); +END +$$; ALTER FUNCTION ltree_in(cstring) PARALLEL SAFE; ALTER FUNCTION ltree_out(ltree) PARALLEL SAFE; diff --git a/contrib/pg_trgm/pg_trgm--1.2--1.3.sql b/contrib/pg_trgm/pg_trgm--1.2--1.3.sql index b082dcd8d841..8dc772c40727 100644 --- a/contrib/pg_trgm/pg_trgm--1.2--1.3.sql +++ b/contrib/pg_trgm/pg_trgm--1.2--1.3.sql @@ -7,21 +7,36 @@ -- We use to_regprocedure() so that query doesn't fail if run against 9.6beta1 definitions, -- wherein the signatures have been updated already. In that case to_regprocedure() will -- return NULL and no updates will happen. 
+DO LANGUAGE plpgsql +$$ +DECLARE + my_schema pg_catalog.text := pg_catalog.quote_ident(pg_catalog.current_schema()); + old_path pg_catalog.text := pg_catalog.current_setting('search_path'); +BEGIN +-- for safety, transiently set search_path to just pg_catalog+pg_temp +PERFORM pg_catalog.set_config('search_path', 'pg_catalog, pg_temp', true); UPDATE pg_catalog.pg_proc SET proargtypes = pg_catalog.array_to_string(newtypes::pg_catalog.oid[], ' ')::pg_catalog.oidvector, pronargs = pg_catalog.array_length(newtypes, 1) FROM (VALUES -(NULL::pg_catalog.text, NULL::pg_catalog.regtype[]), -- establish column types +(NULL::pg_catalog.text, NULL::pg_catalog.text[]), -- establish column types ('gtrgm_consistent(internal,text,int4,oid,internal)', '{internal,text,int2,oid,internal}'), ('gtrgm_distance(internal,text,int4,oid)', '{internal,text,int2,oid,internal}'), ('gtrgm_union(bytea,internal)', '{internal,internal}') -) AS update_data (oldproc, newtypes) -WHERE oid = pg_catalog.to_regprocedure(oldproc); +) AS update_data (oldproc, newtypestext), +LATERAL ( + SELECT array_agg(replace(typ, 'SCH', my_schema)::regtype) as newtypes FROM unnest(newtypestext) typ +) ls +WHERE oid = to_regprocedure(my_schema || '.' 
|| replace(oldproc, 'SCH', my_schema)); UPDATE pg_catalog.pg_proc SET - prorettype = 'gtrgm'::pg_catalog.regtype -WHERE oid = pg_catalog.to_regprocedure('gtrgm_union(internal,internal)'); + prorettype = (my_schema || '.gtrgm')::pg_catalog.regtype +WHERE oid = pg_catalog.to_regprocedure(my_schema || '.gtrgm_union(internal,internal)'); + +PERFORM pg_catalog.set_config('search_path', old_path, true); +END +$$; ALTER FUNCTION set_limit(float4) PARALLEL UNSAFE; ALTER FUNCTION show_limit() PARALLEL SAFE; diff --git a/contrib/seg/seg--1.0--1.1.sql b/contrib/seg/seg--1.0--1.1.sql index 2dcd4d428003..ae6cb2fba889 100644 --- a/contrib/seg/seg--1.0--1.1.sql +++ b/contrib/seg/seg--1.0--1.1.sql @@ -7,15 +7,30 @@ -- We use to_regprocedure() so that query doesn't fail if run against 9.6beta1 definitions, -- wherein the signatures have been updated already. In that case to_regprocedure() will -- return NULL and no updates will happen. +DO LANGUAGE plpgsql +$$ +DECLARE + my_schema pg_catalog.text := pg_catalog.quote_ident(pg_catalog.current_schema()); + old_path pg_catalog.text := pg_catalog.current_setting('search_path'); +BEGIN +-- for safety, transiently set search_path to just pg_catalog+pg_temp +PERFORM pg_catalog.set_config('search_path', 'pg_catalog, pg_temp', true); UPDATE pg_catalog.pg_proc SET proargtypes = pg_catalog.array_to_string(newtypes::pg_catalog.oid[], ' ')::pg_catalog.oidvector, pronargs = pg_catalog.array_length(newtypes, 1) FROM (VALUES -(NULL::pg_catalog.text, NULL::pg_catalog.regtype[]), -- establish column types -('gseg_consistent(internal,seg,int4,oid,internal)', '{internal,seg,int2,oid,internal}') -) AS update_data (oldproc, newtypes) -WHERE oid = pg_catalog.to_regprocedure(oldproc); +(NULL::pg_catalog.text, NULL::pg_catalog.text[]), -- establish column types +('gseg_consistent(internal,SCH.seg,int4,oid,internal)', '{internal,SCH.seg,int2,oid,internal}') +) AS update_data (oldproc, newtypestext), +LATERAL ( + SELECT array_agg(replace(typ, 'SCH', 
my_schema)::regtype) as newtypes FROM unnest(newtypestext) typ +) ls +WHERE oid = to_regprocedure(my_schema || '.' || replace(oldproc, 'SCH', my_schema)); + +PERFORM pg_catalog.set_config('search_path', old_path, true); +END +$$; ALTER FUNCTION seg_in(cstring) PARALLEL SAFE; ALTER FUNCTION seg_out(seg) PARALLEL SAFE; diff --git a/contrib/seg/seg--1.2--1.3.sql b/contrib/seg/seg--1.2--1.3.sql index cd71a300f6df..578e98953ca3 100644 --- a/contrib/seg/seg--1.2--1.3.sql +++ b/contrib/seg/seg--1.2--1.3.sql @@ -12,6 +12,15 @@ -- bound into a particular opclass. There's no SQL command for that, -- so fake it with a manual update on pg_depend. -- +DO LANGUAGE plpgsql +$$ +DECLARE + my_schema pg_catalog.text := pg_catalog.quote_ident(pg_catalog.current_schema()); + old_path pg_catalog.text := pg_catalog.current_setting('search_path'); +BEGIN +-- for safety, transiently set search_path to just pg_catalog+pg_temp +PERFORM pg_catalog.set_config('search_path', 'pg_catalog, pg_temp', true); + UPDATE pg_catalog.pg_depend SET deptype = 'a' WHERE classid = 'pg_catalog.pg_amproc'::pg_catalog.regclass @@ -20,14 +29,10 @@ WHERE classid = 'pg_catalog.pg_amproc'::pg_catalog.regclass FROM pg_catalog.pg_depend WHERE classid = 'pg_catalog.pg_amproc'::pg_catalog.regclass AND refclassid = 'pg_catalog.pg_proc'::pg_catalog.regclass - AND (refobjid = 'gseg_compress(pg_catalog.internal)'::pg_catalog.regprocedure)) + AND (refobjid = (my_schema || '.gseg_compress(internal)')::pg_catalog.regprocedure)) AND refclassid = 'pg_catalog.pg_opclass'::pg_catalog.regclass AND deptype = 'i'; -ALTER OPERATOR FAMILY gist_seg_ops USING gist drop function 3 (seg); -ALTER EXTENSION seg DROP function gseg_compress(pg_catalog.internal); -DROP function gseg_compress(pg_catalog.internal); - UPDATE pg_catalog.pg_depend SET deptype = 'a' WHERE classid = 'pg_catalog.pg_amproc'::pg_catalog.regclass @@ -36,10 +41,18 @@ WHERE classid = 'pg_catalog.pg_amproc'::pg_catalog.regclass FROM pg_catalog.pg_depend WHERE classid = 
'pg_catalog.pg_amproc'::pg_catalog.regclass AND refclassid = 'pg_catalog.pg_proc'::pg_catalog.regclass - AND (refobjid = 'gseg_decompress(pg_catalog.internal)'::pg_catalog.regprocedure)) + AND (refobjid = (my_schema || '.gseg_decompress(internal)')::pg_catalog.regprocedure)) AND refclassid = 'pg_catalog.pg_opclass'::pg_catalog.regclass AND deptype = 'i'; +PERFORM pg_catalog.set_config('search_path', old_path, true); +END +$$; + +ALTER OPERATOR FAMILY gist_seg_ops USING gist drop function 3 (seg); +ALTER EXTENSION seg DROP function gseg_compress(pg_catalog.internal); +DROP function gseg_compress(pg_catalog.internal); + ALTER OPERATOR FAMILY gist_seg_ops USING gist drop function 4 (seg); ALTER EXTENSION seg DROP function gseg_decompress(pg_catalog.internal); DROP function gseg_decompress(pg_catalog.internal); diff --git a/doc/src/sgml/earthdistance.sgml b/doc/src/sgml/earthdistance.sgml index 4ac52cb191cb..641e69c5e984 100644 --- a/doc/src/sgml/earthdistance.sgml +++ b/doc/src/sgml/earthdistance.sgml @@ -10,9 +10,8 @@ The earthdistance module provides two different approaches to calculating great circle distances on the surface of the Earth. The one - described first depends on the cube module (which - must be installed before earthdistance can be - installed). The second one is based on the built-in point data type, + described first depends on the cube module. + The second one is based on the built-in point data type, using longitude and latitude for the coordinates. @@ -24,11 +23,27 @@ - This module is considered trusted, that is, it can be - installed by non-superusers who have CREATE privilege - on the current database. + The cube module must be installed + before earthdistance can be installed + (although you can use the CASCADE option + of CREATE EXTENSION to install both in one command). 
+ + + It is strongly recommended that earthdistance + and cube be installed in the same schema, and that + that schema be one for which CREATE privilege has not been and will not + be granted to any untrusted users. + Otherwise there are installation-time security hazards + if earthdistance's schema contains objects defined + by a hostile user. + Furthermore, when using earthdistance's functions + after installation, the entire search path should contain only trusted + schemas. + + + Cube-Based Earth Distances diff --git a/doc/src/sgml/extend.sgml b/doc/src/sgml/extend.sgml index 890ff97b7aef..641c9ce3c9ba 100644 --- a/doc/src/sgml/extend.sgml +++ b/doc/src/sgml/extend.sgml @@ -540,7 +540,7 @@ RETURNS anycompatible AS ... The extension script may set privileges on objects that are part of the - extension via GRANT and REVOKE + extension, using GRANT and REVOKE statements. The final set of privileges for each object (if any are set) will be stored in the pg_init_privs @@ -597,32 +597,6 @@ RETURNS anycompatible AS ... dropping the whole extension. - - Defining Extension Objects - - - - Widely-distributed extensions should assume little about the database - they occupy. In particular, unless you issued SET search_path = - pg_temp, assume each unqualified name could resolve to an - object that a malicious user has defined. Beware of constructs that - depend on search_path implicitly: IN - and CASE expression WHEN - always select an operator using the search path. In their place, use - OPERATOR(schema.=) ANY - and CASE WHEN expression. - - - - Extension Files @@ -740,7 +714,8 @@ RETURNS anycompatible AS ... If this parameter is true (which is the default), only superusers can create the extension or update it to a new - version. If it is set to false, just the privileges + version (but see also trusted, below). + If it is set to false, just the privileges required to execute the commands in the installation or update script are required. 
This should normally be set to true if any of the @@ -768,6 +743,9 @@ RETURNS anycompatible AS ... Generally, this should not be set true for extensions that could allow access to otherwise-superuser-only abilities, such as file system access. + Also, marking an extension trusted requires significant extra effort + to write the extension's installation and update script(s) securely; + see . @@ -921,7 +899,7 @@ RETURNS anycompatible AS ... schema; that is, CREATE EXTENSION does the equivalent of this: -SET LOCAL search_path TO @extschema@; +SET LOCAL search_path TO @extschema@, pg_temp; This allows the objects created by the script file to go into the target schema. The script file can change search_path if it wishes, @@ -941,9 +919,15 @@ SET LOCAL search_path TO @extschema@; If any prerequisite extensions are listed in requires - in the control file, their target schemas are appended to the initial - setting of search_path. This allows their objects to be - visible to the new extension's script file. + in the control file, their target schemas are added to the initial + setting of search_path, following the new + extension's target schema. This allows their objects to be visible to + the new extension's script file. + + + + For security, pg_temp is automatically appended to + the end of search_path in all cases. @@ -1170,6 +1154,154 @@ SELECT * FROM pg_extension_update_paths('extension_name + + Security Considerations for Extensions + + + Widely-distributed extensions should assume little about the database + they occupy. Therefore, it's appropriate to write functions provided + by an extension in a secure style that cannot be compromised by + search-path-based attacks. + + + + An extension that has the superuser property set to + true must also consider security hazards for the actions taken within + its installation and update scripts. 
It is not terribly difficult for + a malicious user to create trojan-horse objects that will compromise + later execution of a carelessly-written extension script, allowing that + user to acquire superuser privileges. + + + + If an extension is marked trusted, then its + installation schema can be selected by the installing user, who might + intentionally use an insecure schema in hopes of gaining superuser + privileges. Therefore, a trusted extension is extremely exposed from a + security standpoint, and all its script commands must be carefully + examined to ensure that no compromise is possible. + + + + Advice about writing functions securely is provided in + below, and advice + about writing installation scripts securely is provided in + . + + + + Security Considerations for Extension Functions + + + SQL-language and PL-language functions provided by extensions are at + risk of search-path-based attacks when they are executed, since + parsing of these functions occurs at execution time not creation time. + + + + The CREATE + FUNCTION reference page contains advice about + writing SECURITY DEFINER functions safely. It's + good practice to apply those techniques for any function provided by + an extension, since the function might be called by a high-privilege + user. + + + + + If you cannot set the search_path to contain only + secure schemas, assume that each unqualified name could resolve to an + object that a malicious user has defined. Beware of constructs that + depend on search_path implicitly; for + example, IN + and CASE expression WHEN + always select an operator using the search path. In their place, use + OPERATOR(schema.=) ANY + and CASE WHEN expression. + + + + A general-purpose extension usually should not assume that it's been + installed into a secure schema, which means that even schema-qualified + references to its own objects are not entirely risk-free. 
For + example, if the extension has defined a + function myschema.myfunc(bigint) then a call such + as myschema.myfunc(42) could be captured by a + hostile function myschema.myfunc(integer). Be + careful that the data types of function and operator parameters exactly + match the declared argument types, using explicit casts where necessary. + + + + + Security Considerations for Extension Scripts + + + An extension installation or update script should be written to guard + against search-path-based attacks occurring when the script executes. + If an object reference in the script can be made to resolve to some + other object than the script author intended, then a compromise might + occur immediately, or later when the mis-defined extension object is + used. + + + + DDL commands such as CREATE FUNCTION + and CREATE OPERATOR CLASS are generally secure, + but beware of any command having a general-purpose expression as a + component. For example, CREATE VIEW needs to be + vetted, as does a DEFAULT expression + in CREATE FUNCTION. + + + + Sometimes an extension script might need to execute general-purpose + SQL, for example to make catalog adjustments that aren't possible via + DDL. Be careful to execute such commands with a + secure search_path; do not + trust the path provided by CREATE/ALTER EXTENSION + to be secure. Best practice is to temporarily + set search_path to 'pg_catalog, + pg_temp' and insert references to the extension's + installation schema explicitly where needed. (This practice might + also be helpful for creating views.) Examples can be found in + the contrib modules in + the PostgreSQL source code distribution. + + + + Cross-extension references are extremely difficult to make fully + secure, partially because of uncertainty about which schema the other + extension is in. 
The hazards are reduced if both extensions are + installed in the same schema, because then a hostile object cannot be + placed ahead of the referenced extension in the installation-time + search_path. However, no mechanism currently exists + to require that. For now, best practice is to not mark an extension + trusted if it depends on another one, unless that other one is always + installed in pg_catalog. + + + + Do not use CREATE OR REPLACE + FUNCTION, except in an update script that must change the + definition of a function that is known to be an extension member + already. (Likewise for other OR REPLACE options.) + Using OR REPLACE unnecessarily not only has a risk + of accidentally overwriting someone else's function, but it creates a + security hazard since the overwritten function would still be owned by + its original owner, who could modify it. + + + + Extension Example @@ -1189,18 +1321,18 @@ SELECT * FROM pg_extension_update_paths('extension_name (LEFTARG = text, RIGHTARG = text, FUNCTION = pair); -- "SET search_path" is easy to get right, but qualified names perform better. 
-CREATE OR REPLACE FUNCTION lower(pair) +CREATE FUNCTION lower(pair) RETURNS pair LANGUAGE SQL AS 'SELECT ROW(lower($1.k), lower($1.v))::@extschema@.pair;' SET search_path = pg_temp; -CREATE OR REPLACE FUNCTION pair_concat(pair, pair) +CREATE FUNCTION pair_concat(pair, pair) RETURNS pair LANGUAGE SQL AS 'SELECT ROW($1.k OPERATOR(pg_catalog.||) $2.k, $1.v OPERATOR(pg_catalog.||) $2.v)::@extschema@.pair;'; @@ -1215,6 +1347,7 @@ AS 'SELECT ROW($1.k OPERATOR(pg_catalog.||) $2.k, # pair extension comment = 'A key/value pair data type' default_version = '1.0' +# cannot be relocatable because of use of @extschema@ relocatable = false diff --git a/doc/src/sgml/hstore.sgml b/doc/src/sgml/hstore.sgml index fd75e92790b3..8a1caa357613 100644 --- a/doc/src/sgml/hstore.sgml +++ b/doc/src/sgml/hstore.sgml @@ -918,10 +918,14 @@ ALTER TABLE tablename ALTER hstorecol TYPE hstore USING hstorecol || ''; Python dictionaries. - - Of these additional extensions, hstore_plperl is - considered trusted; the rest are not. - + + + It is strongly recommended that the transform extensions be installed in + the same schema as hstore. Otherwise there are + installation-time security hazards if a transform extension's schema + contains objects defined by a hostile user. + + diff --git a/doc/src/sgml/ltree.sgml b/doc/src/sgml/ltree.sgml index dea453fc7599..36aa2b5fad86 100644 --- a/doc/src/sgml/ltree.sgml +++ b/doc/src/sgml/ltree.sgml @@ -835,6 +835,15 @@ ltreetest=> SELECT ins_label(path,2,'Space') FROM test WHERE path <@ 'Top. creating a function, ltree values are mapped to Python lists. (The reverse is currently not supported, however.) + + + + It is strongly recommended that the transform extensions be installed in + the same schema as ltree. Otherwise there are + installation-time security hazards if a transform extension's schema + contains objects defined by a hostile user. 
+ + diff --git a/doc/src/sgml/ref/create_extension.sgml b/doc/src/sgml/ref/create_extension.sgml index 756dd193f854..efd7fc646560 100644 --- a/doc/src/sgml/ref/create_extension.sgml +++ b/doc/src/sgml/ref/create_extension.sgml @@ -177,6 +177,33 @@ CREATE EXTENSION [ IF NOT EXISTS ] extension_name system views. + + + Installing an extension as superuser requires trusting that the + extension's author wrote the extension installation script in a secure + fashion. It is not terribly difficult for a malicious user to create + trojan-horse objects that will compromise later execution of a + carelessly-written extension script, allowing that user to acquire + superuser privileges. However, trojan-horse objects are only hazardous + if they are in the search_path during script + execution, meaning that they are in the extension's installation target + schema or in the schema of some extension it depends on. Therefore, a + good rule of thumb when dealing with extensions whose scripts have not + been carefully vetted is to install them only into schemas for which + CREATE privilege has not been and will not be granted to any untrusted + users. Likewise for any extensions they depend on. + + + + The extensions supplied with PostgreSQL are + believed to be secure against installation-time attacks of this sort, + except for a few that depend on other extensions. As stated in the + documentation for those extensions, they should be installed into secure + schemas, or installed into the same schemas as the extensions they + depend on, or both. + + + For information about writing new extensions, see . 
@@ -188,10 +215,16 @@ CREATE EXTENSION [ IF NOT EXISTS ] extension_name Install the hstore extension into the - current database: + current database, placing its objects in schema addons: + +CREATE EXTENSION hstore SCHEMA addons; + + Another way to accomplish the same thing: +SET search_path = addons; CREATE EXTENSION hstore; - + + diff --git a/src/backend/commands/extension.c b/src/backend/commands/extension.c index c796fcd8da0a..b5630b4c8d98 100644 --- a/src/backend/commands/extension.c +++ b/src/backend/commands/extension.c @@ -908,9 +908,21 @@ execute_extension_script(Oid extensionOid, ExtensionControlFile *control, GUC_ACTION_SAVE, true, 0, false); /* - * Set up the search path to contain the target schema, then the schemas - * of any prerequisite extensions, and nothing else. In particular this - * makes the target schema be the default creation target namespace. + * Similarly disable check_function_bodies, to ensure that SQL functions + * won't be parsed during creation. + */ + if (check_function_bodies) + (void) set_config_option("check_function_bodies", "off", + PGC_USERSET, PGC_S_SESSION, + GUC_ACTION_SAVE, true, 0, false); + + /* + * Set up the search path to have the target schema first, making it be + * the default creation target namespace. Then add the schemas of any + * prerequisite extensions, unless they are in pg_catalog which would be + * searched anyway. (Listing pg_catalog explicitly in a non-first + * position would be bad for security.) Finally add pg_temp to ensure + * that temp objects can't take precedence over others. * * Note: it might look tempting to use PushOverrideSearchPath for this, * but we cannot do that. 
We have to actually set the search_path GUC in @@ -924,9 +936,10 @@ execute_extension_script(Oid extensionOid, ExtensionControlFile *control, Oid reqschema = lfirst_oid(lc); char *reqname = get_namespace_name(reqschema); - if (reqname) + if (reqname && strcmp(reqname, "pg_catalog") != 0) appendStringInfo(&pathbuf, ", %s", quote_identifier(reqname)); } + appendStringInfoString(&pathbuf, ", pg_temp"); (void) set_config_option("search_path", pathbuf.data, PGC_USERSET, PGC_S_SESSION, diff --git a/src/backend/commands/operatorcmds.c b/src/backend/commands/operatorcmds.c index 0a53e9b93e21..bf23937849c9 100644 --- a/src/backend/commands/operatorcmds.c +++ b/src/backend/commands/operatorcmds.c @@ -297,6 +297,7 @@ ValidateJoinEstimator(List *joinName) { Oid typeId[5]; Oid joinOid; + Oid joinOid2; AclResult aclresult; typeId[0] = INTERNALOID; /* PlannerInfo */ @@ -307,15 +308,26 @@ ValidateJoinEstimator(List *joinName) /* * As of Postgres 8.4, the preferred signature for join estimators has 5 - * arguments, but we still allow the old 4-argument form. Try the - * preferred form first. + * arguments, but we still allow the old 4-argument form. Whine about + * ambiguity if both forms exist. 
*/ joinOid = LookupFuncName(joinName, 5, typeId, true); - if (!OidIsValid(joinOid)) - joinOid = LookupFuncName(joinName, 4, typeId, true); - /* If not found, reference the 5-argument signature in error msg */ - if (!OidIsValid(joinOid)) - joinOid = LookupFuncName(joinName, 5, typeId, false); + joinOid2 = LookupFuncName(joinName, 4, typeId, true); + if (OidIsValid(joinOid)) + { + if (OidIsValid(joinOid2)) + ereport(ERROR, + (errcode(ERRCODE_AMBIGUOUS_FUNCTION), + errmsg("join estimator function %s has multiple matches", + NameListToString(joinName)))); + } + else + { + joinOid = joinOid2; + /* If not found, reference the 5-argument signature in error msg */ + if (!OidIsValid(joinOid)) + joinOid = LookupFuncName(joinName, 5, typeId, false); + } /* estimators must return float8 */ if (get_func_rettype(joinOid) != FLOAT8OID) diff --git a/src/backend/commands/typecmds.c b/src/backend/commands/typecmds.c index 2e107ace39be..483bb65ddc89 100644 --- a/src/backend/commands/typecmds.c +++ b/src/backend/commands/typecmds.c @@ -1627,21 +1627,31 @@ findTypeInputFunction(List *procname, Oid typeOid) { Oid argList[3]; Oid procOid; + Oid procOid2; /* * Input functions can take a single argument of type CSTRING, or three - * arguments (string, typioparam OID, typmod). They must return the - * target type. + * arguments (string, typioparam OID, typmod). Whine about ambiguity if + * both forms exist. 
*/ argList[0] = CSTRINGOID; + argList[1] = OIDOID; + argList[2] = INT4OID; procOid = LookupFuncName(procname, 1, argList, true); - if (!OidIsValid(procOid)) + procOid2 = LookupFuncName(procname, 3, argList, true); + if (OidIsValid(procOid)) { - argList[1] = OIDOID; - argList[2] = INT4OID; - - procOid = LookupFuncName(procname, 3, argList, true); + if (OidIsValid(procOid2)) + ereport(ERROR, + (errcode(ERRCODE_AMBIGUOUS_FUNCTION), + errmsg("type input function %s has multiple matches", + NameListToString(procname)))); + } + else + { + procOid = procOid2; + /* If not found, reference the 1-argument signature in error msg */ if (!OidIsValid(procOid)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_FUNCTION), @@ -1649,6 +1659,7 @@ findTypeInputFunction(List *procname, Oid typeOid) func_signature_string(procname, 1, NIL, argList)))); } + /* Input functions must return the target type. */ if (get_func_rettype(procOid) != typeOid) ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), @@ -1714,21 +1725,31 @@ findTypeReceiveFunction(List *procname, Oid typeOid) { Oid argList[3]; Oid procOid; + Oid procOid2; /* * Receive functions can take a single argument of type INTERNAL, or three - * arguments (internal, typioparam OID, typmod). They must return the - * target type. + * arguments (internal, typioparam OID, typmod). Whine about ambiguity if + * both forms exist. 
*/ argList[0] = INTERNALOID; + argList[1] = OIDOID; + argList[2] = INT4OID; procOid = LookupFuncName(procname, 1, argList, true); - if (!OidIsValid(procOid)) + procOid2 = LookupFuncName(procname, 3, argList, true); + if (OidIsValid(procOid)) { - argList[1] = OIDOID; - argList[2] = INT4OID; - - procOid = LookupFuncName(procname, 3, argList, true); + if (OidIsValid(procOid2)) + ereport(ERROR, + (errcode(ERRCODE_AMBIGUOUS_FUNCTION), + errmsg("type receive function %s has multiple matches", + NameListToString(procname)))); + } + else + { + procOid = procOid2; + /* If not found, reference the 1-argument signature in error msg */ if (!OidIsValid(procOid)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_FUNCTION), @@ -1736,6 +1757,7 @@ findTypeReceiveFunction(List *procname, Oid typeOid) func_signature_string(procname, 1, NIL, argList)))); } + /* Receive functions must return the target type. */ if (get_func_rettype(procOid) != typeOid) ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), From e078fb5d4eeb23d0d09932e0b183a8e7bdfb17b4 Mon Sep 17 00:00:00 2001 From: Noah Misch Date: Mon, 10 Aug 2020 09:22:54 -0700 Subject: [PATCH 278/334] Move connect.h from fe_utils to src/include/common. Any libpq client can use the header. Clients include backend components postgres_fdw, dblink, and logical replication apply worker. Back-patch to v10, because another fix needs this. In released branches, just copy the header and keep the original. 
--- contrib/oid2name/oid2name.c | 2 +- contrib/vacuumlo/vacuumlo.c | 2 +- src/bin/pg_basebackup/streamutil.c | 2 +- src/bin/pg_dump/pg_backup_db.c | 2 +- src/bin/pg_dump/pg_dump.c | 2 +- src/bin/pg_dump/pg_dumpall.c | 2 +- src/bin/pg_rewind/libpq_fetch.c | 2 +- src/bin/pg_upgrade/server.c | 2 +- src/bin/scripts/common.c | 2 +- src/bin/scripts/reindexdb.c | 2 +- src/bin/scripts/vacuumdb.c | 2 +- src/fe_utils/cancel.c | 2 +- src/include/{fe_utils => common}/connect.h | 2 +- src/tools/findoidjoins/findoidjoins.c | 2 +- 14 files changed, 14 insertions(+), 14 deletions(-) rename src/include/{fe_utils => common}/connect.h (96%) diff --git a/contrib/oid2name/oid2name.c b/contrib/oid2name/oid2name.c index c7d0f9025a43..91b7958c48ef 100644 --- a/contrib/oid2name/oid2name.c +++ b/contrib/oid2name/oid2name.c @@ -10,8 +10,8 @@ #include "postgres_fe.h" #include "catalog/pg_class_d.h" +#include "common/connect.h" #include "common/logging.h" -#include "fe_utils/connect.h" #include "getopt_long.h" #include "libpq-fe.h" #include "pg_getopt.h" diff --git a/contrib/vacuumlo/vacuumlo.c b/contrib/vacuumlo/vacuumlo.c index 92bdf71356b1..e4019fafaa9e 100644 --- a/contrib/vacuumlo/vacuumlo.c +++ b/contrib/vacuumlo/vacuumlo.c @@ -22,8 +22,8 @@ #endif #include "catalog/pg_class_d.h" +#include "common/connect.h" #include "common/logging.h" -#include "fe_utils/connect.h" #include "getopt_long.h" #include "libpq-fe.h" #include "pg_getopt.h" diff --git a/src/bin/pg_basebackup/streamutil.c b/src/bin/pg_basebackup/streamutil.c index 410116492ea1..c08003e7f2c7 100644 --- a/src/bin/pg_basebackup/streamutil.c +++ b/src/bin/pg_basebackup/streamutil.c @@ -18,11 +18,11 @@ #include #include "access/xlog_internal.h" +#include "common/connect.h" #include "common/fe_memutils.h" #include "common/file_perm.h" #include "common/logging.h" #include "datatype/timestamp.h" -#include "fe_utils/connect.h" #include "port/pg_bswap.h" #include "pqexpbuffer.h" #include "receivelog.h" diff --git 
a/src/bin/pg_dump/pg_backup_db.c b/src/bin/pg_dump/pg_backup_db.c index 6dba7e19e433..94af11b80a39 100644 --- a/src/bin/pg_dump/pg_backup_db.c +++ b/src/bin/pg_dump/pg_backup_db.c @@ -17,8 +17,8 @@ #include #endif +#include "common/connect.h" #include "dumputils.h" -#include "fe_utils/connect.h" #include "fe_utils/string_utils.h" #include "parallel.h" #include "pg_backup_archiver.h" diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 94459b3539ad..9c8436dde6cc 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -52,8 +52,8 @@ #include "catalog/pg_proc_d.h" #include "catalog/pg_trigger_d.h" #include "catalog/pg_type_d.h" +#include "common/connect.h" #include "dumputils.h" -#include "fe_utils/connect.h" #include "fe_utils/string_utils.h" #include "getopt_long.h" #include "libpq/libpq-fs.h" diff --git a/src/bin/pg_dump/pg_dumpall.c b/src/bin/pg_dump/pg_dumpall.c index 8d5484910231..2c82b39af0d2 100644 --- a/src/bin/pg_dump/pg_dumpall.c +++ b/src/bin/pg_dump/pg_dumpall.c @@ -18,10 +18,10 @@ #include #include +#include "common/connect.h" #include "common/file_utils.h" #include "common/logging.h" #include "dumputils.h" -#include "fe_utils/connect.h" #include "fe_utils/string_utils.h" #include "getopt_long.h" #include "pg_backup.h" diff --git a/src/bin/pg_rewind/libpq_fetch.c b/src/bin/pg_rewind/libpq_fetch.c index c44648f82318..bf4dfc23b963 100644 --- a/src/bin/pg_rewind/libpq_fetch.c +++ b/src/bin/pg_rewind/libpq_fetch.c @@ -15,8 +15,8 @@ #include #include "catalog/pg_type_d.h" +#include "common/connect.h" #include "datapagemap.h" -#include "fe_utils/connect.h" #include "fetch.h" #include "file_ops.h" #include "filemap.h" diff --git a/src/bin/pg_upgrade/server.c b/src/bin/pg_upgrade/server.c index 79ec3f04c0ec..7db3c1d51f2e 100644 --- a/src/bin/pg_upgrade/server.c +++ b/src/bin/pg_upgrade/server.c @@ -9,7 +9,7 @@ #include "postgres_fe.h" -#include "fe_utils/connect.h" +#include "common/connect.h" #include 
"fe_utils/string_utils.h" #include "pg_upgrade.h" diff --git a/src/bin/scripts/common.c b/src/bin/scripts/common.c index ee65cc39481a..420d0d11a5a1 100644 --- a/src/bin/scripts/common.c +++ b/src/bin/scripts/common.c @@ -18,9 +18,9 @@ #include #include "common.h" +#include "common/connect.h" #include "common/logging.h" #include "fe_utils/cancel.h" -#include "fe_utils/connect.h" #include "fe_utils/string_utils.h" #define ERRCODE_UNDEFINED_TABLE "42P01" diff --git a/src/bin/scripts/reindexdb.c b/src/bin/scripts/reindexdb.c index b7b19ccc1ca9..40dcbc928332 100644 --- a/src/bin/scripts/reindexdb.c +++ b/src/bin/scripts/reindexdb.c @@ -13,9 +13,9 @@ #include "catalog/pg_class_d.h" #include "common.h" +#include "common/connect.h" #include "common/logging.h" #include "fe_utils/cancel.h" -#include "fe_utils/connect.h" #include "fe_utils/simple_list.h" #include "fe_utils/string_utils.h" #include "scripts_parallel.h" diff --git a/src/bin/scripts/vacuumdb.c b/src/bin/scripts/vacuumdb.c index 6a3c941158fb..125ed2ff5a46 100644 --- a/src/bin/scripts/vacuumdb.c +++ b/src/bin/scripts/vacuumdb.c @@ -15,9 +15,9 @@ #include "catalog/pg_class_d.h" #include "common.h" +#include "common/connect.h" #include "common/logging.h" #include "fe_utils/cancel.h" -#include "fe_utils/connect.h" #include "fe_utils/simple_list.h" #include "fe_utils/string_utils.h" #include "scripts_parallel.h" diff --git a/src/fe_utils/cancel.c b/src/fe_utils/cancel.c index 51fb67d384ad..70042017481a 100644 --- a/src/fe_utils/cancel.c +++ b/src/fe_utils/cancel.c @@ -18,8 +18,8 @@ #include +#include "common/connect.h" #include "fe_utils/cancel.h" -#include "fe_utils/connect.h" #include "fe_utils/string_utils.h" diff --git a/src/include/fe_utils/connect.h b/src/include/common/connect.h similarity index 96% rename from src/include/fe_utils/connect.h rename to src/include/common/connect.h index 8030af9a9f8b..2cc5d7dd251b 100644 --- a/src/include/fe_utils/connect.h +++ b/src/include/common/connect.h @@ -6,7 +6,7 @@ * 
Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * src/include/fe_utils/connect.h + * src/include/common/connect.h * *------------------------------------------------------------------------- */ diff --git a/src/tools/findoidjoins/findoidjoins.c b/src/tools/findoidjoins/findoidjoins.c index 5239332ea7ee..3d9ca2623576 100644 --- a/src/tools/findoidjoins/findoidjoins.c +++ b/src/tools/findoidjoins/findoidjoins.c @@ -10,7 +10,7 @@ #include "access/transam.h" #include "catalog/pg_class_d.h" -#include "fe_utils/connect.h" +#include "common/connect.h" #include "libpq-fe.h" #include "pqexpbuffer.h" From 11da97024abbe76b8c81e3f2375b2a62e9717c67 Mon Sep 17 00:00:00 2001 From: Noah Misch Date: Mon, 10 Aug 2020 09:22:54 -0700 Subject: [PATCH 279/334] Empty search_path in logical replication apply worker and walsender. This is like CVE-2018-1058 commit 582edc369cdbd348d68441fc50fa26a84afd0c1a. Today, a malicious user of a publisher or subscriber database can invoke arbitrary SQL functions under an identity running replication, often a superuser. This fix may cause "does not exist" or "no schema has been selected to create in" errors in a replication process. After upgrading, consider watching server logs for these errors. Objects accruing schema qualification in the wake of the earlier commit are unlikely to need further correction. Back-patch to v10, which introduced logical replication. 
Security: CVE-2020-14349 --- .../libpqwalreceiver/libpqwalreceiver.c | 17 +++++++++++++++++ src/backend/replication/logical/worker.c | 6 ++++++ src/test/subscription/t/001_rep_changes.pl | 4 ++++ 3 files changed, 27 insertions(+) diff --git a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c index e9057230e40c..8afa5a29b484 100644 --- a/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c +++ b/src/backend/replication/libpqwalreceiver/libpqwalreceiver.c @@ -21,6 +21,7 @@ #include "access/xlog.h" #include "catalog/pg_type.h" +#include "common/connect.h" #include "funcapi.h" #include "libpq-fe.h" #include "mb/pg_wchar.h" @@ -213,6 +214,22 @@ libpqrcv_connect(const char *conninfo, bool logical, const char *appname, return NULL; } + if (logical) + { + PGresult *res; + + res = libpqrcv_PQexec(conn->streamConn, + ALWAYS_SECURE_SEARCH_PATH_SQL); + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + PQclear(res); + ereport(ERROR, + (errmsg("could not clear search path: %s", + pchomp(PQerrorMessage(conn->streamConn))))); + } + PQclear(res); + } + conn->logical = logical; return conn; diff --git a/src/backend/replication/logical/worker.c b/src/backend/replication/logical/worker.c index 2fcf2e61bc3e..b576e342cb7d 100644 --- a/src/backend/replication/logical/worker.c +++ b/src/backend/replication/logical/worker.c @@ -2019,6 +2019,12 @@ ApplyWorkerMain(Datum main_arg) MyLogicalRepWorker->userid, 0); + /* + * Set always-secure search path, so malicious users can't redirect user + * code (e.g. pg_index.indexprs). + */ + SetConfigOption("search_path", "", PGC_SUSET, PGC_S_OVERRIDE); + /* Load the subscription into persistent memory context. 
*/ ApplyContext = AllocSetContextCreate(TopMemoryContext, "ApplyContext", diff --git a/src/test/subscription/t/001_rep_changes.pl b/src/test/subscription/t/001_rep_changes.pl index 3f8318fc7cc2..0680f44a1aa5 100644 --- a/src/test/subscription/t/001_rep_changes.pl +++ b/src/test/subscription/t/001_rep_changes.pl @@ -16,6 +16,10 @@ $node_subscriber->start; # Create some preexisting content on publisher +$node_publisher->safe_psql( + 'postgres', + "CREATE FUNCTION public.pg_get_replica_identity_index(int) + RETURNS regclass LANGUAGE sql AS 'SELECT 1/0'"); # shall not call $node_publisher->safe_psql('postgres', "CREATE TABLE tab_notrep AS SELECT generate_series(1,10) AS a"); $node_publisher->safe_psql('postgres', From cec57b1a0fbcd3833086ba686897c5883e0a2afc Mon Sep 17 00:00:00 2001 From: Noah Misch Date: Mon, 10 Aug 2020 09:22:54 -0700 Subject: [PATCH 280/334] Document clashes between logical replication and untrusted users. Back-patch to v10, which introduced logical replication. Security: CVE-2020-14349 --- doc/src/sgml/logical-replication.sgml | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/doc/src/sgml/logical-replication.sgml b/doc/src/sgml/logical-replication.sgml index 7c8629d74efd..3f69b7192682 100644 --- a/doc/src/sgml/logical-replication.sgml +++ b/doc/src/sgml/logical-replication.sgml @@ -513,11 +513,27 @@ Security + + A user able to modify the schema of subscriber-side tables can execute + arbitrary code as a superuser. Limit ownership + and TRIGGER privilege on such tables to roles that + superusers trust. Moreover, if untrusted users can create tables, use only + publications that list tables explicitly. That is to say, create a + subscription FOR ALL TABLES only when superusers trust + every user permitted to create a non-temp table on the publisher or the + subscriber. + + The role used for the replication connection must have - the REPLICATION attribute (or be a superuser). 
Access for the role must be - configured in pg_hba.conf and it must have the - LOGIN attribute. + the REPLICATION attribute (or be a superuser). If the + role lacks SUPERUSER and BYPASSRLS, + publisher row security policies can execute. If the role does not trust + all table owners, include options=-crow_security=off in + the connection string; if a table owner then adds a row security policy, + that setting will cause replication to halt rather than execute the policy. + Access for the role must be configured in pg_hba.conf + and it must have the LOGIN attribute. From 1784f278a63866cc144fcd0a2127cadba6a2b7f8 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Mon, 10 Aug 2020 18:51:31 +0200 Subject: [PATCH 281/334] Replace remaining StrNCpy() by strlcpy() They are equivalent, except that StrNCpy() zero-fills the entire destination buffer instead of providing just one trailing zero. For all but a tiny number of callers, that's just overhead rather than being desirable. Remove StrNCpy() as it is now unused. In some cases, namestrcpy() is the more appropriate function to use. While we're here, simplify the API of namestrcpy(): Remove the return value, don't check for NULL input. Nothing was using that anyway. Also, remove a few unused name-related functions. 
Reviewed-by: Tom Lane Discussion: https://www.postgresql.org/message-id/flat/44f5e198-36f6-6cdb-7fa9-60e34784daae%402ndquadrant.com --- contrib/pgcrypto/crypt-des.c | 2 +- src/backend/access/transam/slru.c | 2 +- src/backend/access/transam/xlogarchive.c | 2 +- src/backend/catalog/pg_constraint.c | 2 +- src/backend/commands/indexcmds.c | 2 +- src/backend/commands/statscmds.c | 2 +- src/backend/commands/tablecmds.c | 2 +- src/backend/postmaster/pgstat.c | 2 +- src/backend/replication/logical/logical.c | 11 ++++- src/backend/replication/slot.c | 2 +- src/backend/utils/adt/formatting.c | 8 ++-- src/backend/utils/adt/name.c | 48 ++----------------- src/backend/utils/adt/pg_locale.c | 9 ---- src/backend/utils/adt/ruleutils.c | 2 +- src/common/exec.c | 4 +- src/include/c.h | 29 ----------- src/include/utils/builtins.h | 3 +- src/interfaces/ecpg/pgtypeslib/dt_common.c | 4 +- src/interfaces/ecpg/test/pg_regress_ecpg.c | 2 +- .../ssl_passphrase_func.c | 2 +- 20 files changed, 34 insertions(+), 106 deletions(-) diff --git a/contrib/pgcrypto/crypt-des.c b/contrib/pgcrypto/crypt-des.c index 6efaa609c9d1..98c30ea122e3 100644 --- a/contrib/pgcrypto/crypt-des.c +++ b/contrib/pgcrypto/crypt-des.c @@ -720,7 +720,7 @@ px_crypt_des(const char *key, const char *setting) if (des_setkey((char *) keybuf)) return NULL; } - StrNCpy(output, setting, 10); + strlcpy(output, setting, 10); /* * Double check that we weren't given a short setting. 
If we were, the diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 9e145f1c36ac..d1dbb43e096c 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -252,7 +252,7 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, */ ctl->shared = shared; ctl->do_fsync = true; /* default behavior */ - StrNCpy(ctl->Dir, subdir, sizeof(ctl->Dir)); + strlcpy(ctl->Dir, subdir, sizeof(ctl->Dir)); } /* diff --git a/src/backend/access/transam/xlogarchive.c b/src/backend/access/transam/xlogarchive.c index cdd586fcfbae..8f8734dc1d4e 100644 --- a/src/backend/access/transam/xlogarchive.c +++ b/src/backend/access/transam/xlogarchive.c @@ -323,7 +323,7 @@ ExecuteRecoveryCommand(const char *command, const char *commandName, bool failOn case 'r': /* %r: filename of last restartpoint */ sp++; - StrNCpy(dp, lastRestartPointFname, endp - dp); + strlcpy(dp, lastRestartPointFname, endp - dp); dp += strlen(dp); break; case '%': diff --git a/src/backend/catalog/pg_constraint.c b/src/backend/catalog/pg_constraint.c index fdc63e7dea16..6a6b2cb8c0c8 100644 --- a/src/backend/catalog/pg_constraint.c +++ b/src/backend/catalog/pg_constraint.c @@ -484,7 +484,7 @@ ChooseConstraintName(const char *name1, const char *name2, conDesc = table_open(ConstraintRelationId, AccessShareLock); /* try the unmodified label first */ - StrNCpy(modlabel, label, sizeof(modlabel)); + strlcpy(modlabel, label, sizeof(modlabel)); for (;;) { diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 2baca12c5f47..7819266a6306 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -2246,7 +2246,7 @@ ChooseRelationName(const char *name1, const char *name2, char modlabel[NAMEDATALEN]; /* try the unmodified label first */ - StrNCpy(modlabel, label, sizeof(modlabel)); + strlcpy(modlabel, label, sizeof(modlabel)); for (;;) { diff --git a/src/backend/commands/statscmds.c 
b/src/backend/commands/statscmds.c index 974828545ca9..3057d89d50c0 100644 --- a/src/backend/commands/statscmds.c +++ b/src/backend/commands/statscmds.c @@ -681,7 +681,7 @@ ChooseExtendedStatisticName(const char *name1, const char *name2, char modlabel[NAMEDATALEN]; /* try the unmodified label first */ - StrNCpy(modlabel, label, sizeof(modlabel)); + strlcpy(modlabel, label, sizeof(modlabel)); for (;;) { diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index ac53f79ada2a..cd989c95e517 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -606,7 +606,7 @@ DefineRelation(CreateStmt *stmt, char relkind, Oid ownerId, * Truncate relname to appropriate length (probably a waste of time, as * parser should have done this already). */ - StrNCpy(relname, stmt->relation->relname, NAMEDATALEN); + strlcpy(relname, stmt->relation->relname, NAMEDATALEN); /* * Check consistency of arguments diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 15f92b66c6ba..73ce944fb1ce 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -4367,7 +4367,7 @@ pgstat_send_archiver(const char *xlog, bool failed) */ pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_ARCHIVER); msg.m_failed = failed; - StrNCpy(msg.m_xlog, xlog, sizeof(msg.m_xlog)); + strlcpy(msg.m_xlog, xlog, sizeof(msg.m_xlog)); msg.m_timestamp = GetCurrentTimestamp(); pgstat_send(&msg, sizeof(msg)); } diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c index f5eb6bc3aff2..57c5b513ccf8 100644 --- a/src/backend/replication/logical/logical.c +++ b/src/backend/replication/logical/logical.c @@ -39,6 +39,7 @@ #include "replication/snapbuild.h" #include "storage/proc.h" #include "storage/procarray.h" +#include "utils/builtins.h" #include "utils/memutils.h" /* data for errcontext callback */ @@ -288,6 +289,7 @@ CreateInitDecodingContext(const char *plugin, { TransactionId 
xmin_horizon = InvalidTransactionId; ReplicationSlot *slot; + NameData plugin_name; LogicalDecodingContext *ctx; MemoryContext old_context; @@ -319,9 +321,14 @@ CreateInitDecodingContext(const char *plugin, (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), errmsg("cannot create logical replication slot in transaction that has performed writes"))); - /* register output plugin name with slot */ + /* + * Register output plugin name with slot. We need the mutex to avoid + * concurrent reading of a partially copied string. But we don't want any + * complicated code while holding a spinlock, so do namestrcpy() outside. + */ + namestrcpy(&plugin_name, plugin); SpinLockAcquire(&slot->mutex); - StrNCpy(NameStr(slot->data.plugin), plugin, NAMEDATALEN); + slot->data.plugin = plugin_name; SpinLockRelease(&slot->mutex); if (XLogRecPtrIsInvalid(restart_lsn)) diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index 57bbb6288c68..3dc01b6df22a 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -275,7 +275,7 @@ ReplicationSlotCreate(const char *name, bool db_specific, /* first initialize persistent data */ memset(&slot->data, 0, sizeof(ReplicationSlotPersistentData)); - StrNCpy(NameStr(slot->data.name), name, NAMEDATALEN); + namestrcpy(&slot->data.name, name); slot->data.database = db_specific ? 
MyDatabaseId : InvalidOid; slot->data.persistency = persistency; diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c index 662643813660..9de63686ecb5 100644 --- a/src/backend/utils/adt/formatting.c +++ b/src/backend/utils/adt/formatting.c @@ -3890,7 +3890,7 @@ DCH_cache_getnew(const char *str, bool std) elog(DEBUG_elog_output, "OLD: '%s' AGE: %d", old->str, old->age); #endif old->valid = false; - StrNCpy(old->str, str, DCH_CACHE_SIZE + 1); + strlcpy(old->str, str, DCH_CACHE_SIZE + 1); old->age = (++DCHCounter); /* caller is expected to fill format, then set valid */ return old; @@ -3904,7 +3904,7 @@ DCH_cache_getnew(const char *str, bool std) DCHCache[n_DCHCache] = ent = (DCHCacheEntry *) MemoryContextAllocZero(TopMemoryContext, sizeof(DCHCacheEntry)); ent->valid = false; - StrNCpy(ent->str, str, DCH_CACHE_SIZE + 1); + strlcpy(ent->str, str, DCH_CACHE_SIZE + 1); ent->std = std; ent->age = (++DCHCounter); /* caller is expected to fill format, then set valid */ @@ -4799,7 +4799,7 @@ NUM_cache_getnew(const char *str) elog(DEBUG_elog_output, "OLD: \"%s\" AGE: %d", old->str, old->age); #endif old->valid = false; - StrNCpy(old->str, str, NUM_CACHE_SIZE + 1); + strlcpy(old->str, str, NUM_CACHE_SIZE + 1); old->age = (++NUMCounter); /* caller is expected to fill format and Num, then set valid */ return old; @@ -4813,7 +4813,7 @@ NUM_cache_getnew(const char *str) NUMCache[n_NUMCache] = ent = (NUMCacheEntry *) MemoryContextAllocZero(TopMemoryContext, sizeof(NUMCacheEntry)); ent->valid = false; - StrNCpy(ent->str, str, NUM_CACHE_SIZE + 1); + strlcpy(ent->str, str, NUM_CACHE_SIZE + 1); ent->age = (++NUMCounter); /* caller is expected to fill format and Num, then set valid */ ++n_NUMCache; diff --git a/src/backend/utils/adt/name.c b/src/backend/utils/adt/name.c index 64877f67e010..a3ce3f3d1e18 100644 --- a/src/backend/utils/adt/name.c +++ b/src/backend/utils/adt/name.c @@ -229,53 +229,13 @@ btnamesortsupport(PG_FUNCTION_ARGS) * MISCELLANEOUS 
PUBLIC ROUTINES * *****************************************************************************/ -int -namecpy(Name n1, const NameData *n2) -{ - if (!n1 || !n2) - return -1; - StrNCpy(NameStr(*n1), NameStr(*n2), NAMEDATALEN); - return 0; -} - -#ifdef NOT_USED -int -namecat(Name n1, Name n2) -{ - return namestrcat(n1, NameStr(*n2)); /* n2 can't be any longer than n1 */ -} -#endif - -int +void namestrcpy(Name name, const char *str) { - if (!name || !str) - return -1; - StrNCpy(NameStr(*name), str, NAMEDATALEN); - return 0; -} - -#ifdef NOT_USED -int -namestrcat(Name name, const char *str) -{ - int i; - char *p, - *q; - - if (!name || !str) - return -1; - for (i = 0, p = NameStr(*name); i < NAMEDATALEN && *p; ++i, ++p) - ; - for (q = str; i < NAMEDATALEN; ++i, ++p, ++q) - { - *p = *q; - if (!*q) - break; - } - return 0; + /* NB: We need to zero-pad the destination. */ + strncpy(NameStr(*name), str, NAMEDATALEN); + NameStr(*name)[NAMEDATALEN-1] = '\0'; } -#endif /* * Compare a NAME to a C string diff --git a/src/backend/utils/adt/pg_locale.c b/src/backend/utils/adt/pg_locale.c index 11d05c73accc..07299dbc0911 100644 --- a/src/backend/utils/adt/pg_locale.c +++ b/src/backend/utils/adt/pg_locale.c @@ -75,16 +75,7 @@ #endif #ifdef WIN32 -/* - * This Windows file defines StrNCpy. We don't need it here, so we undefine - * it to keep the compiler quiet, and undefine it again after the file is - * included, so we don't accidentally use theirs. 
- */ -#undef StrNCpy #include -#ifdef StrNCpy -#undef StrNCpy -#endif #endif #define MAX_L10N_DATA 80 diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 2cbcb4b85e3b..60dd80c23c87 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -2489,7 +2489,7 @@ pg_get_userbyid(PG_FUNCTION_ARGS) if (HeapTupleIsValid(roletup)) { role_rec = (Form_pg_authid) GETSTRUCT(roletup); - StrNCpy(NameStr(*result), NameStr(role_rec->rolname), NAMEDATALEN); + *result = role_rec->rolname; ReleaseSysCache(roletup); } else diff --git a/src/common/exec.c b/src/common/exec.c index f39b0a294bf5..78bb486f999a 100644 --- a/src/common/exec.c +++ b/src/common/exec.c @@ -144,7 +144,7 @@ find_my_exec(const char *argv0, char *retpath) if (first_dir_separator(argv0) != NULL) { if (is_absolute_path(argv0)) - StrNCpy(retpath, argv0, MAXPGPATH); + strlcpy(retpath, argv0, MAXPGPATH); else join_path_components(retpath, cwd, argv0); canonicalize_path(retpath); @@ -184,7 +184,7 @@ find_my_exec(const char *argv0, char *retpath) if (!endp) endp = startp + strlen(startp); /* point to end */ - StrNCpy(test_path, startp, Min(endp - startp + 1, MAXPGPATH)); + strlcpy(test_path, startp, Min(endp - startp + 1, MAXPGPATH)); if (is_absolute_path(test_path)) join_path_components(retpath, test_path, argv0); diff --git a/src/include/c.h b/src/include/c.h index f242e32edbe7..2c61ca8aa894 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -932,35 +932,6 @@ extern void ExceptionalCondition(const char *conditionName, */ #define Abs(x) ((x) >= 0 ? (x) : -(x)) -/* - * StrNCpy - * Like standard library function strncpy(), except that result string - * is guaranteed to be null-terminated --- that is, at most N-1 bytes - * of the source string will be kept. - * Also, the macro returns no result (too hard to do that without - * evaluating the arguments multiple times, which seems worse). 
- * - * BTW: when you need to copy a non-null-terminated string (like a text - * datum) and add a null, do not do it with StrNCpy(..., len+1). That - * might seem to work, but it fetches one byte more than there is in the - * text object. One fine day you'll have a SIGSEGV because there isn't - * another byte before the end of memory. Don't laugh, we've had real - * live bug reports from real live users over exactly this mistake. - * Do it honestly with "memcpy(dst,src,len); dst[len] = '\0';", instead. - */ -#define StrNCpy(dst,src,len) \ - do \ - { \ - char * _dst = (dst); \ - Size _len = (len); \ -\ - if (_len > 0) \ - { \ - strncpy(_dst, (src), _len); \ - _dst[_len-1] = '\0'; \ - } \ - } while (0) - /* Get a bit mask of the bits set in non-long aligned addresses */ #define LONG_ALIGN_MASK (sizeof(long) - 1) diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index 3ca5e938f8f8..4db5ad3f12e5 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -39,8 +39,7 @@ extern uint64 hex_decode(const char *src, size_t len, char *dst); extern int2vector *buildint2vector(const int16 *int2s, int n); /* name.c */ -extern int namecpy(Name n1, const NameData *n2); -extern int namestrcpy(Name name, const char *str); +extern void namestrcpy(Name name, const char *str); extern int namestrcmp(Name name, const char *str); /* numutils.c */ diff --git a/src/interfaces/ecpg/pgtypeslib/dt_common.c b/src/interfaces/ecpg/pgtypeslib/dt_common.c index 14cdf2d428b5..e8a8a0f0ed3e 100644 --- a/src/interfaces/ecpg/pgtypeslib/dt_common.c +++ b/src/interfaces/ecpg/pgtypeslib/dt_common.c @@ -1015,7 +1015,7 @@ abstime2tm(AbsoluteTime _time, int *tzp, struct tm *tm, char **tzn) * Copy no more than MAXTZLEN bytes of timezone to tzn, in case it * contains an error message, which doesn't fit in the buffer */ - StrNCpy(*tzn, tm->tm_zone, MAXTZLEN + 1); + strlcpy(*tzn, tm->tm_zone, MAXTZLEN + 1); if (strlen(tm->tm_zone) > MAXTZLEN) tm->tm_isdst = -1; } @@ 
-1033,7 +1033,7 @@ abstime2tm(AbsoluteTime _time, int *tzp, struct tm *tm, char **tzn) * Copy no more than MAXTZLEN bytes of timezone to tzn, in case it * contains an error message, which doesn't fit in the buffer */ - StrNCpy(*tzn, TZNAME_GLOBAL[tm->tm_isdst], MAXTZLEN + 1); + strlcpy(*tzn, TZNAME_GLOBAL[tm->tm_isdst], MAXTZLEN + 1); if (strlen(TZNAME_GLOBAL[tm->tm_isdst]) > MAXTZLEN) tm->tm_isdst = -1; } diff --git a/src/interfaces/ecpg/test/pg_regress_ecpg.c b/src/interfaces/ecpg/test/pg_regress_ecpg.c index 956a599fcbbc..46b9e78fe59d 100644 --- a/src/interfaces/ecpg/test/pg_regress_ecpg.c +++ b/src/interfaces/ecpg/test/pg_regress_ecpg.c @@ -63,7 +63,7 @@ ecpg_filter(const char *sourcefile, const char *outfile) if (plen > 1) { n = (char *) malloc(plen); - StrNCpy(n, p + 1, plen); + strlcpy(n, p + 1, plen); replace_string(linebuf, n, ""); } } diff --git a/src/test/modules/ssl_passphrase_callback/ssl_passphrase_func.c b/src/test/modules/ssl_passphrase_callback/ssl_passphrase_func.c index 563ff144cc10..6b0a3db104c2 100644 --- a/src/test/modules/ssl_passphrase_callback/ssl_passphrase_func.c +++ b/src/test/modules/ssl_passphrase_callback/ssl_passphrase_func.c @@ -74,7 +74,7 @@ rot13_passphrase(char *buf, int size, int rwflag, void *userdata) { Assert(ssl_passphrase != NULL); - StrNCpy(buf, ssl_passphrase, size); + strlcpy(buf, ssl_passphrase, size); for (char *p = buf; *p; p++) { char c = *p; From 1f75b454134cce6a67a9bcdb01b5c018221dd359 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Tue, 11 Aug 2020 14:37:38 +0900 Subject: [PATCH 282/334] Improve tab completion of REINDEX in psql This allows the tab completion of REINDEX to handle an optional parenthesized list of options. This case is more complicated than VACUUM or ANALYZE because of CONCURRENTLY and the different object types to consider with the reindex. 
Author: Justin Pryzby Reviewed-by: Alexey Kondratov, Michael Paquier Discussion: https://postgr.es/m/20200403182712.GR14618@telsasoft.com --- src/bin/psql/tab-complete.c | 38 ++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c index c4af40bfa9fa..f41785f11c12 100644 --- a/src/bin/psql/tab-complete.c +++ b/src/bin/psql/tab-complete.c @@ -3430,28 +3430,48 @@ psql_completion(const char *text, int start, int end) COMPLETE_WITH("DATA"); /* REINDEX */ - else if (Matches("REINDEX")) + else if (Matches("REINDEX") || + Matches("REINDEX", "(*)")) COMPLETE_WITH("TABLE", "INDEX", "SYSTEM", "SCHEMA", "DATABASE"); - else if (Matches("REINDEX", "TABLE")) + else if (Matches("REINDEX", "TABLE") || + Matches("REINDEX", "(*)", "TABLE")) COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_indexables, " UNION SELECT 'CONCURRENTLY'"); - else if (Matches("REINDEX", "INDEX")) + else if (Matches("REINDEX", "INDEX") || + Matches("REINDEX", "(*)", "INDEX")) COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_indexes, " UNION SELECT 'CONCURRENTLY'"); - else if (Matches("REINDEX", "SCHEMA")) + else if (Matches("REINDEX", "SCHEMA") || + Matches("REINDEX", "(*)", "SCHEMA")) COMPLETE_WITH_QUERY(Query_for_list_of_schemas " UNION SELECT 'CONCURRENTLY'"); - else if (Matches("REINDEX", "SYSTEM|DATABASE")) + else if (Matches("REINDEX", "SYSTEM|DATABASE") || + Matches("REINDEX", "(*)", "SYSTEM|DATABASE")) COMPLETE_WITH_QUERY(Query_for_list_of_databases " UNION SELECT 'CONCURRENTLY'"); - else if (Matches("REINDEX", "TABLE", "CONCURRENTLY")) + else if (Matches("REINDEX", "TABLE", "CONCURRENTLY") || + Matches("REINDEX", "(*)", "TABLE", "CONCURRENTLY")) COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_indexables, NULL); - else if (Matches("REINDEX", "INDEX", "CONCURRENTLY")) + else if (Matches("REINDEX", "INDEX", "CONCURRENTLY") || + Matches("REINDEX", "(*)", "INDEX", "CONCURRENTLY")) 
COMPLETE_WITH_SCHEMA_QUERY(Query_for_list_of_indexes, NULL); - else if (Matches("REINDEX", "SCHEMA", "CONCURRENTLY")) + else if (Matches("REINDEX", "SCHEMA", "CONCURRENTLY") || + Matches("REINDEX", "(*)", "SCHEMA", "CONCURRENTLY")) COMPLETE_WITH_QUERY(Query_for_list_of_schemas); - else if (Matches("REINDEX", "SYSTEM|DATABASE", "CONCURRENTLY")) + else if (Matches("REINDEX", "SYSTEM|DATABASE", "CONCURRENTLY") || + Matches("REINDEX", "(*)", "SYSTEM|DATABASE", "CONCURRENTLY")) COMPLETE_WITH_QUERY(Query_for_list_of_databases); + else if (HeadMatches("REINDEX", "(*") && + !HeadMatches("REINDEX", "(*)")) + { + /* + * This fires if we're in an unfinished parenthesized option list. + * get_previous_words treats a completed parenthesized option list as + * one word, so the above test is correct. + */ + if (ends_with(prev_wd, '(') || ends_with(prev_wd, ',')) + COMPLETE_WITH("VERBOSE"); + } /* SECURITY LABEL */ else if (Matches("SECURITY")) From fea10a64340e529805609126740a540c8f9daab4 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Tue, 11 Aug 2020 11:25:23 -0700 Subject: [PATCH 283/334] Rename VariableCacheData.nextFullXid to nextXid. Including Full in variable names duplicates the type information and leads to overly long names. As FullTransactionId cannot accidentally be casted to TransactionId that does not seem necessary. 
Author: Andres Freund Discussion: https://postgr.es/m/20200724011143.jccsyvsvymuiqfxu@alap3.anarazel.de --- src/backend/access/gist/gistxlog.c | 6 ++-- src/backend/access/rmgrdesc/xlogdesc.c | 4 +-- src/backend/access/transam/clog.c | 8 ++--- src/backend/access/transam/commit_ts.c | 4 +-- src/backend/access/transam/multixact.c | 2 +- src/backend/access/transam/subtrans.c | 10 +++--- src/backend/access/transam/twophase.c | 22 ++++++------ src/backend/access/transam/varsup.c | 26 +++++++------- src/backend/access/transam/xact.c | 4 +-- src/backend/access/transam/xlog.c | 48 ++++++++++++------------- src/backend/access/transam/xlogreader.c | 4 +-- src/backend/storage/ipc/procarray.c | 14 ++++---- src/backend/storage/ipc/standby.c | 2 +- src/backend/storage/lmgr/predicate.c | 2 +- src/backend/utils/misc/pg_controldata.c | 4 +-- src/bin/pg_controldata/pg_controldata.c | 4 +-- src/bin/pg_resetwal/pg_resetwal.c | 18 +++++----- src/include/access/transam.h | 4 +-- src/include/catalog/pg_control.h | 2 +- src/include/storage/standby.h | 2 +- src/include/storage/standbydefs.h | 2 +- 21 files changed, 96 insertions(+), 96 deletions(-) diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index 3f0effd5e429..7b5d1e98b70b 100644 --- a/src/backend/access/gist/gistxlog.c +++ b/src/backend/access/gist/gistxlog.c @@ -396,7 +396,7 @@ gistRedoPageReuse(XLogReaderState *record) if (InHotStandby) { FullTransactionId latestRemovedFullXid = xlrec->latestRemovedFullXid; - FullTransactionId nextFullXid = ReadNextFullTransactionId(); + FullTransactionId nextXid = ReadNextFullTransactionId(); uint64 diff; /* @@ -405,8 +405,8 @@ gistRedoPageReuse(XLogReaderState *record) * logged value is very old, so that XID wrap-around already happened * on it, there can't be any snapshots that still see it. 
*/ - nextFullXid = ReadNextFullTransactionId(); - diff = U64FromFullTransactionId(nextFullXid) - + nextXid = ReadNextFullTransactionId(); + diff = U64FromFullTransactionId(nextXid) - U64FromFullTransactionId(latestRemovedFullXid); if (diff < MaxTransactionId / 2) { diff --git a/src/backend/access/rmgrdesc/xlogdesc.c b/src/backend/access/rmgrdesc/xlogdesc.c index 1cd97852e8f3..3200f777f5a3 100644 --- a/src/backend/access/rmgrdesc/xlogdesc.c +++ b/src/backend/access/rmgrdesc/xlogdesc.c @@ -53,8 +53,8 @@ xlog_desc(StringInfo buf, XLogReaderState *record) checkpoint->ThisTimeLineID, checkpoint->PrevTimeLineID, checkpoint->fullPageWrites ? "true" : "false", - EpochFromFullTransactionId(checkpoint->nextFullXid), - XidFromFullTransactionId(checkpoint->nextFullXid), + EpochFromFullTransactionId(checkpoint->nextXid), + XidFromFullTransactionId(checkpoint->nextXid), checkpoint->nextOid, checkpoint->nextMulti, checkpoint->nextMultiOffset, diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index f3da40ae017f..dd2f4d5bc7e7 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -742,12 +742,12 @@ ZeroCLOGPage(int pageno, bool writeXlog) /* * This must be called ONCE during postmaster or standalone-backend startup, - * after StartupXLOG has initialized ShmemVariableCache->nextFullXid. + * after StartupXLOG has initialized ShmemVariableCache->nextXid. 
*/ void StartupCLOG(void) { - TransactionId xid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid); + TransactionId xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); int pageno = TransactionIdToPage(xid); LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); @@ -766,7 +766,7 @@ StartupCLOG(void) void TrimCLOG(void) { - TransactionId xid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid); + TransactionId xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); int pageno = TransactionIdToPage(xid); LWLockAcquire(XactSLRULock, LW_EXCLUSIVE); @@ -785,7 +785,7 @@ TrimCLOG(void) * but makes no WAL entry). Let's just be safe. (We need not worry about * pages beyond the current one, since those will be zeroed when first * used. For the same reason, there is no need to do anything when - * nextFullXid is exactly at a page boundary; and it's likely that the + * nextXid is exactly at a page boundary; and it's likely that the * "current" page doesn't exist yet in that case.) */ if (TransactionIdToPgIndex(xid) != 0) diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c index 903280ae92d0..5244b06a2b65 100644 --- a/src/backend/access/transam/commit_ts.c +++ b/src/backend/access/transam/commit_ts.c @@ -614,7 +614,7 @@ ZeroCommitTsPage(int pageno, bool writeXlog) /* * This must be called ONCE during postmaster or standalone-backend startup, - * after StartupXLOG has initialized ShmemVariableCache->nextFullXid. + * after StartupXLOG has initialized ShmemVariableCache->nextXid. 
*/ void StartupCommitTs(void) @@ -704,7 +704,7 @@ ActivateCommitTs(void) } LWLockRelease(CommitTsLock); - xid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid); + xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); pageno = TransactionIdToCTsPage(xid); /* diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 475f5ed86110..b8bedca04a4d 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -3265,7 +3265,7 @@ multixact_redo(XLogReaderState *record) xlrec->moff + xlrec->nmembers); /* - * Make sure nextFullXid is beyond any XID mentioned in the record. + * Make sure nextXid is beyond any XID mentioned in the record. * This should be unnecessary, since any XID found here ought to have * other evidence in the XLOG, but let's be safe. */ diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c index f33ae407a609..a087a5554210 100644 --- a/src/backend/access/transam/subtrans.c +++ b/src/backend/access/transam/subtrans.c @@ -241,15 +241,15 @@ ZeroSUBTRANSPage(int pageno) /* * This must be called ONCE during postmaster or standalone-backend startup, - * after StartupXLOG has initialized ShmemVariableCache->nextFullXid. + * after StartupXLOG has initialized ShmemVariableCache->nextXid. * - * oldestActiveXID is the oldest XID of any prepared transaction, or nextFullXid + * oldestActiveXID is the oldest XID of any prepared transaction, or nextXid * if there are none. 
*/ void StartupSUBTRANS(TransactionId oldestActiveXID) { - FullTransactionId nextFullXid; + FullTransactionId nextXid; int startPage; int endPage; @@ -262,8 +262,8 @@ StartupSUBTRANS(TransactionId oldestActiveXID) LWLockAcquire(SubtransSLRULock, LW_EXCLUSIVE); startPage = TransactionIdToPage(oldestActiveXID); - nextFullXid = ShmemVariableCache->nextFullXid; - endPage = TransactionIdToPage(XidFromFullTransactionId(nextFullXid)); + nextXid = ShmemVariableCache->nextXid; + endPage = TransactionIdToPage(XidFromFullTransactionId(nextXid)); while (startPage != endPage) { diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 9b2e59bf0ec1..31f135f5cedc 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -1813,16 +1813,16 @@ restoreTwoPhaseData(void) * * Scan the shared memory entries of TwoPhaseState and determine the range * of valid XIDs present. This is run during database startup, after we - * have completed reading WAL. ShmemVariableCache->nextFullXid has been set to + * have completed reading WAL. ShmemVariableCache->nextXid has been set to * one more than the highest XID for which evidence exists in WAL. * - * We throw away any prepared xacts with main XID beyond nextFullXid --- if any + * We throw away any prepared xacts with main XID beyond nextXid --- if any * are present, it suggests that the DBA has done a PITR recovery to an * earlier point in time without cleaning out pg_twophase. We dare not * try to recover such prepared xacts since they likely depend on database * state that doesn't exist now. * - * However, we will advance nextFullXid beyond any subxact XIDs belonging to + * However, we will advance nextXid beyond any subxact XIDs belonging to * valid prepared xacts. We need to do this since subxact commit doesn't * write a WAL entry, and so there might be no evidence in WAL of those * subxact XIDs. 
@@ -1832,7 +1832,7 @@ restoreTwoPhaseData(void) * backup should be rolled in. * * Our other responsibility is to determine and return the oldest valid XID - * among the prepared xacts (if none, return ShmemVariableCache->nextFullXid). + * among the prepared xacts (if none, return ShmemVariableCache->nextXid). * This is needed to synchronize pg_subtrans startup properly. * * If xids_p and nxids_p are not NULL, pointer to a palloc'd array of all @@ -1842,8 +1842,8 @@ restoreTwoPhaseData(void) TransactionId PrescanPreparedTransactions(TransactionId **xids_p, int *nxids_p) { - FullTransactionId nextFullXid = ShmemVariableCache->nextFullXid; - TransactionId origNextXid = XidFromFullTransactionId(nextFullXid); + FullTransactionId nextXid = ShmemVariableCache->nextXid; + TransactionId origNextXid = XidFromFullTransactionId(nextXid); TransactionId result = origNextXid; TransactionId *xids = NULL; int nxids = 0; @@ -2059,7 +2059,7 @@ RecoverPreparedTransactions(void) * * If setParent is true, set up subtransaction parent linkages. * - * If setNextXid is true, set ShmemVariableCache->nextFullXid to the newest + * If setNextXid is true, set ShmemVariableCache->nextXid to the newest * value scanned. */ static char * @@ -2068,8 +2068,8 @@ ProcessTwoPhaseBuffer(TransactionId xid, bool fromdisk, bool setParent, bool setNextXid) { - FullTransactionId nextFullXid = ShmemVariableCache->nextFullXid; - TransactionId origNextXid = XidFromFullTransactionId(nextFullXid); + FullTransactionId nextXid = ShmemVariableCache->nextXid; + TransactionId origNextXid = XidFromFullTransactionId(nextXid); TransactionId *subxids; char *buf; TwoPhaseFileHeader *hdr; @@ -2149,7 +2149,7 @@ ProcessTwoPhaseBuffer(TransactionId xid, /* * Examine subtransaction XIDs ... they should all follow main XID, and - * they may force us to advance nextFullXid. + * they may force us to advance nextXid. 
*/ subxids = (TransactionId *) (buf + MAXALIGN(sizeof(TwoPhaseFileHeader)) + @@ -2160,7 +2160,7 @@ ProcessTwoPhaseBuffer(TransactionId xid, Assert(TransactionIdFollows(subxid, xid)); - /* update nextFullXid if needed */ + /* update nextXid if needed */ if (setNextXid) AdvanceNextFullTransactionIdPastXid(subxid); diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 0142bc70f6a6..3ebd75118f06 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -75,7 +75,7 @@ GetNewTransactionId(bool isSubXact) LWLockAcquire(XidGenLock, LW_EXCLUSIVE); - full_xid = ShmemVariableCache->nextFullXid; + full_xid = ShmemVariableCache->nextXid; xid = XidFromFullTransactionId(full_xid); /*---------- @@ -159,7 +159,7 @@ GetNewTransactionId(bool isSubXact) /* Re-acquire lock and start over */ LWLockAcquire(XidGenLock, LW_EXCLUSIVE); - full_xid = ShmemVariableCache->nextFullXid; + full_xid = ShmemVariableCache->nextXid; xid = XidFromFullTransactionId(full_xid); } @@ -177,12 +177,12 @@ GetNewTransactionId(bool isSubXact) ExtendSUBTRANS(xid); /* - * Now advance the nextFullXid counter. This must not happen until after + * Now advance the nextXid counter. This must not happen until after * we have successfully completed ExtendCLOG() --- if that routine fails, * we want the next incoming transaction to try it again. We cannot * assign more XIDs until there is CLOG space for them. */ - FullTransactionIdAdvance(&ShmemVariableCache->nextFullXid); + FullTransactionIdAdvance(&ShmemVariableCache->nextXid); /* * We must store the new XID into the shared ProcArray before releasing @@ -240,7 +240,7 @@ GetNewTransactionId(bool isSubXact) } /* - * Read nextFullXid but don't allocate it. + * Read nextXid but don't allocate it. 
*/ FullTransactionId ReadNextFullTransactionId(void) @@ -248,14 +248,14 @@ ReadNextFullTransactionId(void) FullTransactionId fullXid; LWLockAcquire(XidGenLock, LW_SHARED); - fullXid = ShmemVariableCache->nextFullXid; + fullXid = ShmemVariableCache->nextXid; LWLockRelease(XidGenLock); return fullXid; } /* - * Advance nextFullXid to the value after a given xid. The epoch is inferred. + * Advance nextXid to the value after a given xid. The epoch is inferred. * This must only be called during recovery or from two-phase start-up code. */ void @@ -266,14 +266,14 @@ AdvanceNextFullTransactionIdPastXid(TransactionId xid) uint32 epoch; /* - * It is safe to read nextFullXid without a lock, because this is only + * It is safe to read nextXid without a lock, because this is only * called from the startup process or single-process mode, meaning that no * other process can modify it. */ Assert(AmStartupProcess() || !IsUnderPostmaster); /* Fast return if this isn't an xid high enough to move the needle. */ - next_xid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid); + next_xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); if (!TransactionIdFollowsOrEquals(xid, next_xid)) return; @@ -286,7 +286,7 @@ AdvanceNextFullTransactionIdPastXid(TransactionId xid) * point in the WAL stream. */ TransactionIdAdvance(xid); - epoch = EpochFromFullTransactionId(ShmemVariableCache->nextFullXid); + epoch = EpochFromFullTransactionId(ShmemVariableCache->nextXid); if (unlikely(xid < next_xid)) ++epoch; newNextFullXid = FullTransactionIdFromEpochAndXid(epoch, xid); @@ -296,7 +296,7 @@ AdvanceNextFullTransactionIdPastXid(TransactionId xid) * concurrent readers. 
*/ LWLockAcquire(XidGenLock, LW_EXCLUSIVE); - ShmemVariableCache->nextFullXid = newNextFullXid; + ShmemVariableCache->nextXid = newNextFullXid; LWLockRelease(XidGenLock); } @@ -404,7 +404,7 @@ SetTransactionIdLimit(TransactionId oldest_datfrozenxid, Oid oldest_datoid) ShmemVariableCache->xidStopLimit = xidStopLimit; ShmemVariableCache->xidWrapLimit = xidWrapLimit; ShmemVariableCache->oldestXidDB = oldest_datoid; - curXid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid); + curXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); LWLockRelease(XidGenLock); /* Log the info */ @@ -480,7 +480,7 @@ ForceTransactionIdLimitUpdate(void) /* Locking is probably not really necessary, but let's be careful */ LWLockAcquire(XidGenLock, LW_SHARED); - nextXid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid); + nextXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); xidVacLimit = ShmemVariableCache->xidVacLimit; oldestXid = ShmemVariableCache->oldestXid; oldestXidDB = ShmemVariableCache->oldestXidDB; diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 727d61603593..7ccb7d68ed9a 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -5791,7 +5791,7 @@ xact_redo_commit(xl_xact_parsed_commit *parsed, max_xid = TransactionIdLatest(xid, parsed->nsubxacts, parsed->subxacts); - /* Make sure nextFullXid is beyond any XID mentioned in the record. */ + /* Make sure nextXid is beyond any XID mentioned in the record. */ AdvanceNextFullTransactionIdPastXid(max_xid); Assert(((parsed->xinfo & XACT_XINFO_HAS_ORIGIN) == 0) == @@ -5931,7 +5931,7 @@ xact_redo_abort(xl_xact_parsed_abort *parsed, TransactionId xid) Assert(TransactionIdIsValid(xid)); - /* Make sure nextFullXid is beyond any XID mentioned in the record. */ + /* Make sure nextXid is beyond any XID mentioned in the record. 
*/ max_xid = TransactionIdLatest(xid, parsed->nsubxacts, parsed->subxacts); diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 756b838e6a54..53945c0e305d 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -601,7 +601,7 @@ typedef struct XLogCtlData /* Protected by info_lck: */ XLogwrtRqst LogwrtRqst; XLogRecPtr RedoRecPtr; /* a recent copy of Insert->RedoRecPtr */ - FullTransactionId ckptFullXid; /* nextFullXid of latest checkpoint */ + FullTransactionId ckptFullXid; /* nextXid of latest checkpoint */ XLogRecPtr asyncXactLSN; /* LSN of newest async commit/abort */ XLogRecPtr replicationSlotMinLSN; /* oldest LSN needed by any slot */ @@ -5239,7 +5239,7 @@ BootStrapXLOG(void) checkPoint.ThisTimeLineID = ThisTimeLineID; checkPoint.PrevTimeLineID = ThisTimeLineID; checkPoint.fullPageWrites = fullPageWrites; - checkPoint.nextFullXid = + checkPoint.nextXid = FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId); checkPoint.nextOid = FirstBootstrapObjectId; checkPoint.nextMulti = FirstMultiXactId; @@ -5253,7 +5253,7 @@ BootStrapXLOG(void) checkPoint.time = (pg_time_t) time(NULL); checkPoint.oldestActiveXid = InvalidTransactionId; - ShmemVariableCache->nextFullXid = checkPoint.nextFullXid; + ShmemVariableCache->nextXid = checkPoint.nextXid; ShmemVariableCache->nextOid = checkPoint.nextOid; ShmemVariableCache->oidCount = 0; MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); @@ -6741,7 +6741,7 @@ StartupXLOG(void) wasShutdown ? 
"true" : "false"))); ereport(DEBUG1, (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u", - U64FromFullTransactionId(checkPoint.nextFullXid), + U64FromFullTransactionId(checkPoint.nextXid), checkPoint.nextOid))); ereport(DEBUG1, (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u", @@ -6756,12 +6756,12 @@ StartupXLOG(void) (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u", checkPoint.oldestCommitTsXid, checkPoint.newestCommitTsXid))); - if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextFullXid))) + if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid))) ereport(PANIC, (errmsg("invalid next transaction ID"))); /* initialize shared memory variables from the checkpoint record */ - ShmemVariableCache->nextFullXid = checkPoint.nextFullXid; + ShmemVariableCache->nextXid = checkPoint.nextXid; ShmemVariableCache->nextOid = checkPoint.nextOid; ShmemVariableCache->oidCount = 0; MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); @@ -6770,7 +6770,7 @@ StartupXLOG(void) SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true); SetCommitTsLimit(checkPoint.oldestCommitTsXid, checkPoint.newestCommitTsXid); - XLogCtl->ckptFullXid = checkPoint.nextFullXid; + XLogCtl->ckptFullXid = checkPoint.nextXid; /* * Initialize replication slots, before there's a chance to remove @@ -7051,7 +7051,7 @@ StartupXLOG(void) Assert(TransactionIdIsValid(oldestActiveXID)); /* Tell procarray about the range of xids it has to deal with */ - ProcArrayInitRecovery(XidFromFullTransactionId(ShmemVariableCache->nextFullXid)); + ProcArrayInitRecovery(XidFromFullTransactionId(ShmemVariableCache->nextXid)); /* * Startup commit log and subtrans only. 
MultiXact and commit @@ -7081,9 +7081,9 @@ StartupXLOG(void) running.xcnt = nxids; running.subxcnt = 0; running.subxid_overflow = false; - running.nextXid = XidFromFullTransactionId(checkPoint.nextFullXid); + running.nextXid = XidFromFullTransactionId(checkPoint.nextXid); running.oldestRunningXid = oldestActiveXID; - latestCompletedXid = XidFromFullTransactionId(checkPoint.nextFullXid); + latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid); TransactionIdRetreat(latestCompletedXid); Assert(TransactionIdIsNormal(latestCompletedXid)); running.latestCompletedXid = latestCompletedXid; @@ -7254,7 +7254,7 @@ StartupXLOG(void) error_context_stack = &errcallback; /* - * ShmemVariableCache->nextFullXid must be beyond record's + * ShmemVariableCache->nextXid must be beyond record's * xid. */ AdvanceNextFullTransactionIdPastXid(record->xl_xid); @@ -7865,7 +7865,7 @@ StartupXLOG(void) /* also initialize latestCompletedXid, to nextXid - 1 */ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - ShmemVariableCache->latestCompletedXid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid); + ShmemVariableCache->latestCompletedXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); TransactionIdRetreat(ShmemVariableCache->latestCompletedXid); LWLockRelease(ProcArrayLock); @@ -8897,7 +8897,7 @@ CreateCheckPoint(int flags) * there. 
*/ LWLockAcquire(XidGenLock, LW_SHARED); - checkPoint.nextFullXid = ShmemVariableCache->nextFullXid; + checkPoint.nextXid = ShmemVariableCache->nextXid; checkPoint.oldestXid = ShmemVariableCache->oldestXid; checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB; LWLockRelease(XidGenLock); @@ -9050,7 +9050,7 @@ CreateCheckPoint(int flags) /* Update shared-memory copy of checkpoint XID/epoch */ SpinLockAcquire(&XLogCtl->info_lck); - XLogCtl->ckptFullXid = checkPoint.nextFullXid; + XLogCtl->ckptFullXid = checkPoint.nextXid; SpinLockRelease(&XLogCtl->info_lck); /* @@ -9926,7 +9926,7 @@ xlog_redo(XLogReaderState *record) memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint)); /* In a SHUTDOWN checkpoint, believe the counters exactly */ LWLockAcquire(XidGenLock, LW_EXCLUSIVE); - ShmemVariableCache->nextFullXid = checkPoint.nextFullXid; + ShmemVariableCache->nextXid = checkPoint.nextXid; LWLockRelease(XidGenLock); LWLockAcquire(OidGenLock, LW_EXCLUSIVE); ShmemVariableCache->nextOid = checkPoint.nextOid; @@ -9980,9 +9980,9 @@ xlog_redo(XLogReaderState *record) running.xcnt = nxids; running.subxcnt = 0; running.subxid_overflow = false; - running.nextXid = XidFromFullTransactionId(checkPoint.nextFullXid); + running.nextXid = XidFromFullTransactionId(checkPoint.nextXid); running.oldestRunningXid = oldestActiveXID; - latestCompletedXid = XidFromFullTransactionId(checkPoint.nextFullXid); + latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid); TransactionIdRetreat(latestCompletedXid); Assert(TransactionIdIsNormal(latestCompletedXid)); running.latestCompletedXid = latestCompletedXid; @@ -9995,12 +9995,12 @@ xlog_redo(XLogReaderState *record) /* ControlFile->checkPointCopy always tracks the latest ckpt XID */ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); - ControlFile->checkPointCopy.nextFullXid = checkPoint.nextFullXid; + ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; LWLockRelease(ControlFileLock); /* Update shared-memory copy of checkpoint 
XID/epoch */ SpinLockAcquire(&XLogCtl->info_lck); - XLogCtl->ckptFullXid = checkPoint.nextFullXid; + XLogCtl->ckptFullXid = checkPoint.nextXid; SpinLockRelease(&XLogCtl->info_lck); /* @@ -10021,9 +10021,9 @@ xlog_redo(XLogReaderState *record) memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint)); /* In an ONLINE checkpoint, treat the XID counter as a minimum */ LWLockAcquire(XidGenLock, LW_EXCLUSIVE); - if (FullTransactionIdPrecedes(ShmemVariableCache->nextFullXid, - checkPoint.nextFullXid)) - ShmemVariableCache->nextFullXid = checkPoint.nextFullXid; + if (FullTransactionIdPrecedes(ShmemVariableCache->nextXid, + checkPoint.nextXid)) + ShmemVariableCache->nextXid = checkPoint.nextXid; LWLockRelease(XidGenLock); /* @@ -10054,12 +10054,12 @@ xlog_redo(XLogReaderState *record) checkPoint.oldestXidDB); /* ControlFile->checkPointCopy always tracks the latest ckpt XID */ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); - ControlFile->checkPointCopy.nextFullXid = checkPoint.nextFullXid; + ControlFile->checkPointCopy.nextXid = checkPoint.nextXid; LWLockRelease(ControlFileLock); /* Update shared-memory copy of checkpoint XID/epoch */ SpinLockAcquire(&XLogCtl->info_lck); - XLogCtl->ckptFullXid = checkPoint.nextFullXid; + XLogCtl->ckptFullXid = checkPoint.nextXid; SpinLockRelease(&XLogCtl->info_lck); /* TLI should not change in an on-line checkpoint */ diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index a757baccfc55..67996018da27 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -1613,8 +1613,8 @@ XLogRecGetFullXid(XLogReaderState *record) Assert(AmStartupProcess() || !IsUnderPostmaster); xid = XLogRecGetXid(record); - next_xid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid); - epoch = EpochFromFullTransactionId(ShmemVariableCache->nextFullXid); + next_xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + epoch = 
EpochFromFullTransactionId(ShmemVariableCache->nextXid); /* * If xid is numerically greater than next_xid, it has to be from the last diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index b44853356446..be0240e0ddcd 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -878,10 +878,10 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running) LWLockRelease(ProcArrayLock); - /* ShmemVariableCache->nextFullXid must be beyond any observed xid. */ + /* ShmemVariableCache->nextXid must be beyond any observed xid. */ AdvanceNextFullTransactionIdPastXid(latestObservedXid); - Assert(FullTransactionIdIsValid(ShmemVariableCache->nextFullXid)); + Assert(FullTransactionIdIsValid(ShmemVariableCache->nextXid)); KnownAssignedXidsDisplay(trace_recovery(DEBUG3)); if (standbyState == STANDBY_SNAPSHOT_READY) @@ -1986,7 +1986,7 @@ GetRunningTransactionData(void) latestCompletedXid = ShmemVariableCache->latestCompletedXid; - oldestRunningXid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid); + oldestRunningXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); /* * Spin over procArray collecting all xids @@ -2078,7 +2078,7 @@ GetRunningTransactionData(void) CurrentRunningXacts->xcnt = count - subcount; CurrentRunningXacts->subxcnt = subcount; CurrentRunningXacts->subxid_overflow = suboverflowed; - CurrentRunningXacts->nextXid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid); + CurrentRunningXacts->nextXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); CurrentRunningXacts->oldestRunningXid = oldestRunningXid; CurrentRunningXacts->latestCompletedXid = latestCompletedXid; @@ -2123,7 +2123,7 @@ GetOldestActiveTransactionId(void) * have already completed), when we spin over it. 
*/ LWLockAcquire(XidGenLock, LW_SHARED); - oldestRunningXid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid); + oldestRunningXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); LWLockRelease(XidGenLock); /* @@ -2191,7 +2191,7 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly) * a safe, albeit pessimal, value. */ LWLockAcquire(XidGenLock, LW_SHARED); - oldestSafeXid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid); + oldestSafeXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); /* * If there's already a slot pegging the xmin horizon, we can start with @@ -3361,7 +3361,7 @@ RecordKnownAssignedTransactionIds(TransactionId xid) */ latestObservedXid = xid; - /* ShmemVariableCache->nextFullXid must be beyond any observed xid */ + /* ShmemVariableCache->nextXid must be beyond any observed xid */ AdvanceNextFullTransactionIdPastXid(latestObservedXid); next_expected_xid = latestObservedXid; TransactionIdAdvance(next_expected_xid); diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index f5229839cfc3..52b2809dac03 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -889,7 +889,7 @@ standby_redo(XLogReaderState *record) * up from a checkpoint and are immediately at our starting point, we * unconditionally move to STANDBY_INITIALIZED. 
After this point we * must do 4 things: - * * move shared nextFullXid forwards as we see new xids + * * move shared nextXid forwards as we see new xids * * extend the clog and subtrans with each new xid * * keep track of uncommitted known assigned xids * * keep track of uncommitted AccessExclusiveLocks diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c index d24919f76b67..a2f8e7524b49 100644 --- a/src/backend/storage/lmgr/predicate.c +++ b/src/backend/storage/lmgr/predicate.c @@ -3390,7 +3390,7 @@ ReleasePredicateLocks(bool isCommit, bool isReadOnlySafe) * transaction to complete before freeing some RAM; correctness of visible * behavior is not affected. */ - MySerializableXact->finishedBefore = XidFromFullTransactionId(ShmemVariableCache->nextFullXid); + MySerializableXact->finishedBefore = XidFromFullTransactionId(ShmemVariableCache->nextXid); /* * If it's not a commit it's either a rollback or a read-only transaction diff --git a/src/backend/utils/misc/pg_controldata.c b/src/backend/utils/misc/pg_controldata.c index 419b58330fea..609231275893 100644 --- a/src/backend/utils/misc/pg_controldata.c +++ b/src/backend/utils/misc/pg_controldata.c @@ -165,8 +165,8 @@ pg_control_checkpoint(PG_FUNCTION_ARGS) nulls[5] = false; values[6] = CStringGetTextDatum(psprintf("%u:%u", - EpochFromFullTransactionId(ControlFile->checkPointCopy.nextFullXid), - XidFromFullTransactionId(ControlFile->checkPointCopy.nextFullXid))); + EpochFromFullTransactionId(ControlFile->checkPointCopy.nextXid), + XidFromFullTransactionId(ControlFile->checkPointCopy.nextXid))); nulls[6] = false; values[7] = ObjectIdGetDatum(ControlFile->checkPointCopy.nextOid); diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index e73639df744b..3e00ac0f701a 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -250,8 +250,8 @@ main(int argc, char *argv[]) printf(_("Latest checkpoint's 
full_page_writes: %s\n"), ControlFile->checkPointCopy.fullPageWrites ? _("on") : _("off")); printf(_("Latest checkpoint's NextXID: %u:%u\n"), - EpochFromFullTransactionId(ControlFile->checkPointCopy.nextFullXid), - XidFromFullTransactionId(ControlFile->checkPointCopy.nextFullXid)); + EpochFromFullTransactionId(ControlFile->checkPointCopy.nextXid), + XidFromFullTransactionId(ControlFile->checkPointCopy.nextXid)); printf(_("Latest checkpoint's NextOID: %u\n"), ControlFile->checkPointCopy.nextOid); printf(_("Latest checkpoint's NextMultiXactId: %u\n"), diff --git a/src/bin/pg_resetwal/pg_resetwal.c b/src/bin/pg_resetwal/pg_resetwal.c index 233441837f8a..cb6ef1918206 100644 --- a/src/bin/pg_resetwal/pg_resetwal.c +++ b/src/bin/pg_resetwal/pg_resetwal.c @@ -424,14 +424,14 @@ main(int argc, char *argv[]) * if any, includes these values.) */ if (set_xid_epoch != -1) - ControlFile.checkPointCopy.nextFullXid = + ControlFile.checkPointCopy.nextXid = FullTransactionIdFromEpochAndXid(set_xid_epoch, - XidFromFullTransactionId(ControlFile.checkPointCopy.nextFullXid)); + XidFromFullTransactionId(ControlFile.checkPointCopy.nextXid)); if (set_xid != 0) { - ControlFile.checkPointCopy.nextFullXid = - FullTransactionIdFromEpochAndXid(EpochFromFullTransactionId(ControlFile.checkPointCopy.nextFullXid), + ControlFile.checkPointCopy.nextXid = + FullTransactionIdFromEpochAndXid(EpochFromFullTransactionId(ControlFile.checkPointCopy.nextXid), set_xid); /* @@ -684,7 +684,7 @@ GuessControlValues(void) ControlFile.checkPointCopy.ThisTimeLineID = 1; ControlFile.checkPointCopy.PrevTimeLineID = 1; ControlFile.checkPointCopy.fullPageWrites = false; - ControlFile.checkPointCopy.nextFullXid = + ControlFile.checkPointCopy.nextXid = FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId); ControlFile.checkPointCopy.nextOid = FirstBootstrapObjectId; ControlFile.checkPointCopy.nextMulti = FirstMultiXactId; @@ -756,8 +756,8 @@ PrintControlValues(bool guessed) printf(_("Latest checkpoint's 
full_page_writes: %s\n"), ControlFile.checkPointCopy.fullPageWrites ? _("on") : _("off")); printf(_("Latest checkpoint's NextXID: %u:%u\n"), - EpochFromFullTransactionId(ControlFile.checkPointCopy.nextFullXid), - XidFromFullTransactionId(ControlFile.checkPointCopy.nextFullXid)); + EpochFromFullTransactionId(ControlFile.checkPointCopy.nextXid), + XidFromFullTransactionId(ControlFile.checkPointCopy.nextXid)); printf(_("Latest checkpoint's NextOID: %u\n"), ControlFile.checkPointCopy.nextOid); printf(_("Latest checkpoint's NextMultiXactId: %u\n"), @@ -847,7 +847,7 @@ PrintNewControlValues(void) if (set_xid != 0) { printf(_("NextXID: %u\n"), - XidFromFullTransactionId(ControlFile.checkPointCopy.nextFullXid)); + XidFromFullTransactionId(ControlFile.checkPointCopy.nextXid)); printf(_("OldestXID: %u\n"), ControlFile.checkPointCopy.oldestXid); printf(_("OldestXID's DB: %u\n"), @@ -857,7 +857,7 @@ PrintNewControlValues(void) if (set_xid_epoch != -1) { printf(_("NextXID epoch: %u\n"), - EpochFromFullTransactionId(ControlFile.checkPointCopy.nextFullXid)); + EpochFromFullTransactionId(ControlFile.checkPointCopy.nextXid)); } if (set_oldest_commit_ts_xid != 0) diff --git a/src/include/access/transam.h b/src/include/access/transam.h index a91a0c7487d8..85508300e9a2 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -175,12 +175,12 @@ typedef struct VariableCacheData /* * These fields are protected by XidGenLock. 
*/ - FullTransactionId nextFullXid; /* next full XID to assign */ + FullTransactionId nextXid; /* next XID to assign */ TransactionId oldestXid; /* cluster-wide minimum datfrozenxid */ TransactionId xidVacLimit; /* start forcing autovacuums here */ TransactionId xidWarnLimit; /* start complaining here */ - TransactionId xidStopLimit; /* refuse to advance nextFullXid beyond here */ + TransactionId xidStopLimit; /* refuse to advance nextXid beyond here */ TransactionId xidWrapLimit; /* where the world ends */ Oid oldestXidDB; /* database with minimum datfrozenxid */ diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index de5670e53826..06bed90c5e9e 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -40,7 +40,7 @@ typedef struct CheckPoint TimeLineID PrevTimeLineID; /* previous TLI, if this record begins a new * timeline (equals ThisTimeLineID otherwise) */ bool fullPageWrites; /* current full_page_writes */ - FullTransactionId nextFullXid; /* next free full transaction ID */ + FullTransactionId nextXid; /* next free transaction ID */ Oid nextOid; /* next free OID */ MultiXactId nextMulti; /* next free MultiXactId */ MultiXactOffset nextMultiOffset; /* next free MultiXact offset */ diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h index cfbe426e5ae3..faaf1d3817b6 100644 --- a/src/include/storage/standby.h +++ b/src/include/storage/standby.h @@ -72,7 +72,7 @@ typedef struct RunningTransactionsData int xcnt; /* # of xact ids in xids[] */ int subxcnt; /* # of subxact ids in xids[] */ bool subxid_overflow; /* snapshot overflowed, subxids missing */ - TransactionId nextXid; /* xid from ShmemVariableCache->nextFullXid */ + TransactionId nextXid; /* xid from ShmemVariableCache->nextXid */ TransactionId oldestRunningXid; /* *not* oldestXmin */ TransactionId latestCompletedXid; /* so we can set xmax */ diff --git a/src/include/storage/standbydefs.h b/src/include/storage/standbydefs.h 
index 4876d2eeea13..4dda1c403a4b 100644 --- a/src/include/storage/standbydefs.h +++ b/src/include/storage/standbydefs.h @@ -49,7 +49,7 @@ typedef struct xl_running_xacts int xcnt; /* # of xact ids in xids[] */ int subxcnt; /* # of subxact ids in xids[] */ bool subxid_overflow; /* snapshot overflowed, subxids missing */ - TransactionId nextXid; /* xid from ShmemVariableCache->nextFullXid */ + TransactionId nextXid; /* xid from ShmemVariableCache->nextXid */ TransactionId oldestRunningXid; /* *not* oldestXmin */ TransactionId latestCompletedXid; /* so we can set xmax */ From 3bd7f9969a240827bc2effa399170b7565238fd2 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Tue, 11 Aug 2020 17:41:18 -0700 Subject: [PATCH 284/334] Track latest completed xid as a FullTransactionId. The reason for doing so is that a subsequent commit will need that to avoid wraparound issues. As the subsequent change is large this was split out for easier review. The reason this is not a perfect straight-forward change is that we do not want track 64bit xids in the procarray or the WAL. Therefore we need to advance lastestCompletedXid in relation to 32 bit xids. The code for that is now centralized in MaintainLatestCompletedXid*. 
Author: Andres Freund Reviewed-By: Thomas Munro, Robert Haas, David Rowley Discussion: https://postgr.es/m/20200301083601.ews6hz5dduc3w2se@alap3.anarazel.de --- src/backend/access/transam/varsup.c | 50 +++++++++++ src/backend/access/transam/xlog.c | 4 +- src/backend/storage/ipc/procarray.c | 129 ++++++++++++++++++++++------ src/include/access/transam.h | 37 +++++++- 4 files changed, 191 insertions(+), 29 deletions(-) diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 3ebd75118f06..2ef0f4991caf 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -569,3 +569,53 @@ GetNewObjectId(void) return result; } + + +#ifdef USE_ASSERT_CHECKING + +/* + * Assert that xid is between [oldestXid, nextXid], which is the range we + * expect XIDs coming from tables etc to be in. + * + * As ShmemVariableCache->oldestXid could change just after this call without + * further precautions, and as a wrapped-around xid could again fall within + * the valid range, this assertion can only detect if something is definitely + * wrong, but not establish correctness. + * + * This intentionally does not expose a return value, to avoid code being + * introduced that depends on the return value. + */ +void +AssertTransactionIdInAllowableRange(TransactionId xid) +{ + TransactionId oldest_xid; + TransactionId next_xid; + + Assert(TransactionIdIsValid(xid)); + + /* we may see bootstrap / frozen */ + if (!TransactionIdIsNormal(xid)) + return; + + /* + * We can't acquire XidGenLock, as this may be called with XidGenLock + * already held (or with other locks that don't allow XidGenLock to be + * nested). That's ok for our purposes though, since we already rely on + * 32bit reads to be atomic. While nextXid is 64 bit, we only look at + * the lower 32bit, so a skewed read doesn't hurt. + * + * There's no increased danger of falling outside [oldest, next] by + * accessing them without a lock. 
xid needs to have been created with + * GetNewTransactionId() in the originating session, and the locks there + * pair with the memory barrier below. We do however accept xid to be <= + * to next_xid, instead of just <, as xid could be from the procarray, + * before we see the updated nextXid value. + */ + pg_memory_barrier(); + oldest_xid = ShmemVariableCache->oldestXid; + next_xid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + + Assert(TransactionIdFollowsOrEquals(xid, oldest_xid) || + TransactionIdPrecedesOrEquals(xid, next_xid)); +} +#endif diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 53945c0e305d..8f72faee82cc 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7865,8 +7865,8 @@ StartupXLOG(void) /* also initialize latestCompletedXid, to nextXid - 1 */ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - ShmemVariableCache->latestCompletedXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); - TransactionIdRetreat(ShmemVariableCache->latestCompletedXid); + ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid; + FullTransactionIdRetreat(&ShmemVariableCache->latestCompletedXid); LWLockRelease(ProcArrayLock); /* diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index be0240e0ddcd..522518695eec 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -175,6 +175,11 @@ static void KnownAssignedXidsReset(void); static inline void ProcArrayEndTransactionInternal(PGPROC *proc, PGXACT *pgxact, TransactionId latestXid); static void ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid); +static void MaintainLatestCompletedXid(TransactionId latestXid); +static void MaintainLatestCompletedXidRecovery(TransactionId latestXid); + +static inline FullTransactionId FullXidRelativeTo(FullTransactionId rel, + TransactionId xid); /* * Report shared-memory space needed by 
CreateSharedProcArray. @@ -349,9 +354,7 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid) Assert(TransactionIdIsValid(allPgXact[proc->pgprocno].xid)); /* Advance global latestCompletedXid while holding the lock */ - if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, - latestXid)) - ShmemVariableCache->latestCompletedXid = latestXid; + MaintainLatestCompletedXid(latestXid); } else { @@ -464,9 +467,7 @@ ProcArrayEndTransactionInternal(PGPROC *proc, PGXACT *pgxact, pgxact->overflowed = false; /* Also advance global latestCompletedXid while holding the lock */ - if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, - latestXid)) - ShmemVariableCache->latestCompletedXid = latestXid; + MaintainLatestCompletedXid(latestXid); } /* @@ -621,6 +622,59 @@ ProcArrayClearTransaction(PGPROC *proc) pgxact->overflowed = false; } +/* + * Update ShmemVariableCache->latestCompletedXid to point to latestXid if + * currently older. + */ +static void +MaintainLatestCompletedXid(TransactionId latestXid) +{ + FullTransactionId cur_latest = ShmemVariableCache->latestCompletedXid; + + Assert(FullTransactionIdIsValid(cur_latest)); + Assert(!RecoveryInProgress()); + Assert(LWLockHeldByMe(ProcArrayLock)); + + if (TransactionIdPrecedes(XidFromFullTransactionId(cur_latest), latestXid)) + { + ShmemVariableCache->latestCompletedXid = + FullXidRelativeTo(cur_latest, latestXid); + } + + Assert(IsBootstrapProcessingMode() || + FullTransactionIdIsNormal(ShmemVariableCache->latestCompletedXid)); +} + +/* + * Same as MaintainLatestCompletedXid, except for use during WAL replay. + */ +static void +MaintainLatestCompletedXidRecovery(TransactionId latestXid) +{ + FullTransactionId cur_latest = ShmemVariableCache->latestCompletedXid; + FullTransactionId rel; + + Assert(AmStartupProcess() || !IsUnderPostmaster); + Assert(LWLockHeldByMe(ProcArrayLock)); + + /* + * Need a FullTransactionId to compare latestXid with. 
Can't rely on + * latestCompletedXid to be initialized in recovery. But in recovery it's + * safe to access nextXid without a lock for the startup process. + */ + rel = ShmemVariableCache->nextXid; + Assert(FullTransactionIdIsValid(ShmemVariableCache->nextXid)); + + if (!FullTransactionIdIsValid(cur_latest) || + TransactionIdPrecedes(XidFromFullTransactionId(cur_latest), latestXid)) + { + ShmemVariableCache->latestCompletedXid = + FullXidRelativeTo(rel, latestXid); + } + + Assert(FullTransactionIdIsNormal(ShmemVariableCache->latestCompletedXid)); +} + /* * ProcArrayInitRecovery -- initialize recovery xid mgmt environment * @@ -869,12 +923,9 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running) * If a transaction wrote a commit record in the gap between taking and * logging the snapshot then latestCompletedXid may already be higher than * the value from the snapshot, so check before we use the incoming value. + * It also might not yet be set at all. */ - if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, - running->latestCompletedXid)) - ShmemVariableCache->latestCompletedXid = running->latestCompletedXid; - - Assert(TransactionIdIsNormal(ShmemVariableCache->latestCompletedXid)); + MaintainLatestCompletedXidRecovery(running->latestCompletedXid); LWLockRelease(ProcArrayLock); @@ -989,6 +1040,7 @@ TransactionIdIsInProgress(TransactionId xid) int nxids = 0; ProcArrayStruct *arrayP = procArray; TransactionId topxid; + TransactionId latestCompletedXid; int i, j; @@ -1051,7 +1103,9 @@ TransactionIdIsInProgress(TransactionId xid) * Now that we have the lock, we can check latestCompletedXid; if the * target Xid is after that, it's surely still running. 
*/ - if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, xid)) + latestCompletedXid = + XidFromFullTransactionId(ShmemVariableCache->latestCompletedXid); + if (TransactionIdPrecedes(latestCompletedXid, xid)) { LWLockRelease(ProcArrayLock); xc_by_latest_xid_inc(); @@ -1330,9 +1384,9 @@ GetOldestXmin(Relation rel, int flags) * and so protects us against overestimating the result due to future * additions. */ - result = ShmemVariableCache->latestCompletedXid; - Assert(TransactionIdIsNormal(result)); + result = XidFromFullTransactionId(ShmemVariableCache->latestCompletedXid); TransactionIdAdvance(result); + Assert(TransactionIdIsNormal(result)); for (index = 0; index < arrayP->numProcs; index++) { @@ -1511,6 +1565,7 @@ GetSnapshotData(Snapshot snapshot) int count = 0; int subcount = 0; bool suboverflowed = false; + FullTransactionId latest_completed; TransactionId replication_slot_xmin = InvalidTransactionId; TransactionId replication_slot_catalog_xmin = InvalidTransactionId; @@ -1554,10 +1609,11 @@ GetSnapshotData(Snapshot snapshot) */ LWLockAcquire(ProcArrayLock, LW_SHARED); + latest_completed = ShmemVariableCache->latestCompletedXid; /* xmax is always latestCompletedXid + 1 */ - xmax = ShmemVariableCache->latestCompletedXid; - Assert(TransactionIdIsNormal(xmax)); + xmax = XidFromFullTransactionId(latest_completed); TransactionIdAdvance(xmax); + Assert(TransactionIdIsNormal(xmax)); /* initialize xmin calculation with xmax */ globalxmin = xmin = xmax; @@ -1984,9 +2040,10 @@ GetRunningTransactionData(void) LWLockAcquire(ProcArrayLock, LW_SHARED); LWLockAcquire(XidGenLock, LW_SHARED); - latestCompletedXid = ShmemVariableCache->latestCompletedXid; - - oldestRunningXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + latestCompletedXid = + XidFromFullTransactionId(ShmemVariableCache->latestCompletedXid); + oldestRunningXid = + XidFromFullTransactionId(ShmemVariableCache->nextXid); /* * Spin over procArray collecting all xids @@ -3207,9 +3264,7 @@ 
XidCacheRemoveRunningXids(TransactionId xid, elog(WARNING, "did not find subXID %u in MyProc", xid); /* Also advance global latestCompletedXid while holding the lock */ - if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, - latestXid)) - ShmemVariableCache->latestCompletedXid = latestXid; + MaintainLatestCompletedXid(latestXid); LWLockRelease(ProcArrayLock); } @@ -3236,6 +3291,32 @@ DisplayXidCache(void) } #endif /* XIDCACHE_DEBUG */ +/* + * Convert a 32 bit transaction id into 64 bit transaction id, by assuming it + * is within MaxTransactionId / 2 of XidFromFullTransactionId(rel). + * + * Be very careful about when to use this function. It can only safely be used + * when there is a guarantee that xid is within MaxTransactionId / 2 xids of + * rel. That e.g. can be guaranteed if the the caller assures a snapshot is + * held by the backend and xid is from a table (where vacuum/freezing ensures + * the xid has to be within that range), or if xid is from the procarray and + * prevents xid wraparound that way. 
+ */ +static inline FullTransactionId +FullXidRelativeTo(FullTransactionId rel, TransactionId xid) +{ + TransactionId rel_xid = XidFromFullTransactionId(rel); + + Assert(TransactionIdIsValid(xid)); + Assert(TransactionIdIsValid(rel_xid)); + + /* not guaranteed to find issues, but likely to catch mistakes */ + AssertTransactionIdInAllowableRange(xid); + + return FullTransactionIdFromU64(U64FromFullTransactionId(rel) + + (int32) (xid - rel_xid)); +} + /* ---------------------------------------------- * KnownAssignedTransactionIds sub-module @@ -3388,9 +3469,7 @@ ExpireTreeKnownAssignedTransactionIds(TransactionId xid, int nsubxids, KnownAssignedXidsRemoveTree(xid, nsubxids, subxids); /* As in ProcArrayEndTransaction, advance latestCompletedXid */ - if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, - max_xid)) - ShmemVariableCache->latestCompletedXid = max_xid; + MaintainLatestCompletedXidRecovery(max_xid); LWLockRelease(ProcArrayLock); } diff --git a/src/include/access/transam.h b/src/include/access/transam.h index 85508300e9a2..8db326ad1b50 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -54,6 +54,8 @@ #define FullTransactionIdFollowsOrEquals(a, b) ((a).value >= (b).value) #define FullTransactionIdIsValid(x) TransactionIdIsValid(XidFromFullTransactionId(x)) #define InvalidFullTransactionId FullTransactionIdFromEpochAndXid(0, InvalidTransactionId) +#define FirstNormalFullTransactionId FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId) +#define FullTransactionIdIsNormal(x) FullTransactionIdFollowsOrEquals(x, FirstNormalFullTransactionId) /* * A 64 bit value that contains an epoch and a TransactionId. This is @@ -102,6 +104,31 @@ FullTransactionIdAdvance(FullTransactionId *dest) dest->value++; } +/* + * Retreat a FullTransactionId variable, stepping over xids that would appear + * to be special only when viewed as 32bit XIDs. 
+ */ +static inline void +FullTransactionIdRetreat(FullTransactionId *dest) +{ + dest->value--; + + /* + * In contrast to 32bit XIDs don't step over the "actual" special xids. + * For 64bit xids these can't be reached as part of a wraparound as they + * can in the 32bit case. + */ + if (FullTransactionIdPrecedes(*dest, FirstNormalFullTransactionId)) + return; + + /* + * But we do need to step over XIDs that'd appear special only for 32bit + * XIDs. + */ + while (XidFromFullTransactionId(*dest) < FirstNormalTransactionId) + dest->value--; +} + /* back up a transaction ID variable, handling wraparound correctly */ #define TransactionIdRetreat(dest) \ do { \ @@ -193,8 +220,8 @@ typedef struct VariableCacheData /* * These fields are protected by ProcArrayLock. */ - TransactionId latestCompletedXid; /* newest XID that has committed or - * aborted */ + FullTransactionId latestCompletedXid; /* newest full XID that has + * committed or aborted */ /* * These fields are protected by XactTruncationLock @@ -244,6 +271,12 @@ extern void AdvanceOldestClogXid(TransactionId oldest_datfrozenxid); extern bool ForceTransactionIdLimitUpdate(void); extern Oid GetNewObjectId(void); +#ifdef USE_ASSERT_CHECKING +extern void AssertTransactionIdInAllowableRange(TransactionId xid); +#else +#define AssertTransactionIdInAllowableRange(xid) ((void)true) +#endif + /* * Some frontend programs include this header. For compilers that emit static * inline functions even when they're unused, that leads to unsatisfied From 3546cf8a7a9dc57e6aa98f5fc1ac5476ad6b99ff Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 12 Aug 2020 11:54:16 -0400 Subject: [PATCH 285/334] Improve comments for postmaster.c's BackendList. This had gotten a little disjointed over time, and some of the grammar was sloppy. Rewrite for more clarity. In passing, re-pgindent some recently added comments. No code changes. 
--- src/backend/postmaster/postmaster.c | 37 ++++++++++++++++------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 5b5fc97c72da..38e2c16ac206 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -156,28 +156,32 @@ * authorization phase). This is used mainly to keep track of how many * children we have and send them appropriate signals when necessary. * - * "Special" children such as the startup, bgwriter and autovacuum launcher - * tasks are not in this list. Autovacuum worker and walsender are in it. + * As shown in the above set of backend types, this list includes not only + * "normal" client sessions, but also autovacuum workers, walsenders, and + * background workers. (Note that at the time of launch, walsenders are + * labeled BACKEND_TYPE_NORMAL; we relabel them to BACKEND_TYPE_WALSND + * upon noticing they've changed their PMChildFlags entry. Hence that check + * must be done before any operation that needs to distinguish walsenders + * from normal backends.) + * * Also, "dead_end" children are in it: these are children launched just for * the purpose of sending a friendly rejection message to a would-be client. * We must track them because they are attached to shared memory, but we know * they will never become live backends. dead_end children are not assigned a - * PMChildSlot. + * PMChildSlot. dead_end children have bkend_type NORMAL. * - * Background workers are in this list, too. + * "Special" children such as the startup, bgwriter and autovacuum launcher + * tasks are not in this list. They are tracked via StartupPID and other + * pid_t variables below. (Thus, there can't be more than one of any given + * "special" child process type. We use BackendList entries for any child + * process there can be more than one of.) 
*/ typedef struct bkend { pid_t pid; /* process id of backend */ int32 cancel_key; /* cancel key for cancels for this backend */ int child_slot; /* PMChildSlot for this backend, if any */ - - /* - * Flavor of backend or auxiliary process. Note that BACKEND_TYPE_WALSND - * backends initially announce themselves as BACKEND_TYPE_NORMAL, so if - * bkend_type is normal, you should check for a recent transition. - */ - int bkend_type; + int bkend_type; /* child process flavor, see above */ bool dead_end; /* is it going to send an error and quit? */ bool bgworker_notify; /* gets bgworker start/stop notifications */ dlist_node elem; /* list link in BackendList */ @@ -1059,10 +1063,9 @@ PostmasterMain(int argc, char *argv[]) * only during a few moments during a standby promotion. However there is * a race condition: if pg_ctl promote is executed and creates the files * during a promotion, the files can stay around even after the server is - * brought up to be the primary. Then, if a new standby starts by using the - * backup taken from the new primary, the files can exist at the server - * startup and should be removed in order to avoid an unexpected - * promotion. + * brought up to be the primary. Then, if a new standby starts by using + * the backup taken from the new primary, the files can exist at server + * startup and must be removed in order to avoid an unexpected promotion. * * Note that promotion signal files need to be removed before the startup * process is invoked. Because, after that, they can be used by @@ -5336,8 +5339,8 @@ sigusr1_handler(SIGNAL_ARGS) /* * Tell startup process to finish recovery. * - * Leave the promote signal file in place and let the Startup - * process do the unlink. + * Leave the promote signal file in place and let the Startup process + * do the unlink. 
*/ signal_child(StartupPID, SIGUSR2); } From 1f42d35a1d6144a23602b2c0bc7f97f3046cf890 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Wed, 12 Aug 2020 15:33:36 -0400 Subject: [PATCH 286/334] BRIN: Handle concurrent desummarization properly If a page range is desummarized at just the right time concurrently with an index walk, BRIN would raise an error indicating index corruption. This is scary and unhelpful; silently returning that the page range is not summarized is sufficient reaction. This bug was introduced by commit 975ad4e602ff as additional protection against a bug whose actual fix was elsewhere. Backpatch equally. Reported-By: Anastasia Lubennikova Diagnosed-By: Alexander Lakhin Discussion: https://postgr.es/m/2588667e-d07d-7e10-74e2-7e1e46194491@postgrespro.ru Backpatch: 9.5 - master --- src/backend/access/brin/brin_revmap.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/backend/access/brin/brin_revmap.c b/src/backend/access/brin/brin_revmap.c index e8b8308f82ec..35746714a7c4 100644 --- a/src/backend/access/brin/brin_revmap.c +++ b/src/backend/access/brin/brin_revmap.c @@ -282,10 +282,17 @@ brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk, /* If we land on a revmap page, start over */ if (BRIN_IS_REGULAR_PAGE(page)) { + /* + * If the offset number is greater than what's in the page, it's + * possible that the range was desummarized concurrently. Just + * return NULL to handle that case. + */ if (*off > PageGetMaxOffsetNumber(page)) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg_internal("corrupted BRIN index: inconsistent range map"))); + { + LockBuffer(*buf, BUFFER_LOCK_UNLOCK); + return NULL; + } + lp = PageGetItemId(page, *off); if (ItemIdIsUsed(lp)) { From dc7420c2c9274a283779ec19718d2d16323640c0 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Wed, 12 Aug 2020 16:03:49 -0700 Subject: [PATCH 287/334] snapshot scalability: Don't compute global horizons while building snapshots. 
To make GetSnapshotData() more scalable, it cannot look at each proc's
xmin: While snapshot contents do not need to change whenever a read-only
transaction commits or a snapshot is released, a proc's xmin is modified
in those cases. The frequency of xmin modifications leads to,
particularly on higher core count systems, many cache misses inside
GetSnapshotData(), despite the data underlying a snapshot not
changing. That is the most significant source of GetSnapshotData()
scaling poorly on larger systems.

Without accessing xmins, GetSnapshotData() cannot calculate accurate
horizons / thresholds as it has so far. But we don't really have to: The
horizons don't actually change that much between GetSnapshotData()
calls. Nor are the horizons actually used every time a snapshot is
built.

The trick this commit introduces is to delay computation of accurate
horizons until their use, and to use horizon boundaries to determine
whether accurate horizons need to be computed.

The use of RecentGlobal[Data]Xmin to decide whether a row version could
be removed has been replaced with new GlobalVisTest* functions. These
use two thresholds to determine whether a row can be pruned:
1) definitely_needed, indicating that rows deleted by XIDs >=
definitely_needed are definitely still visible.
2) maybe_needed, indicating that rows deleted by XIDs < maybe_needed can
definitely be removed

GetSnapshotData() updates definitely_needed to be the xmin of the
computed snapshot. When testing whether a row can be removed (with
GlobalVisTestIsRemovableXid()) and the tested XID falls in between the
two (i.e. XID >= maybe_needed && XID < definitely_needed) the
boundaries can be recomputed to be more accurate. As it is not cheap to
compute accurate boundaries, we limit the number of times that happens
in short succession. 
As the boundaries used by GlobalVisTestIsRemovableXid() are never
reset (with maybe_needed updated by GetSnapshotData()), it is likely
that further tests can benefit from an earlier computation of accurate
horizons.

To avoid regressing performance when old_snapshot_threshold is set (as
that requires an accurate horizon to be computed), heap_page_prune_opt()
doesn't unconditionally call TransactionIdLimitedForOldSnapshots()
anymore. Both the computation of the limited horizon, and the triggering
of errors (with SetOldSnapshotThresholdTimestamp()) are now only done
when necessary to remove tuples.

This commit just removes the accesses to PGXACT->xmin from
GetSnapshotData(), but other members of PGXACT residing in the same
cache line are accessed. Therefore this in itself does not result in a
significant improvement. Subsequent commits will take advantage of the
fact that GetSnapshotData() now does not need to access xmins anymore.

Note: This contains a workaround in heap_page_prune_opt() to keep the
snapshot_too_old tests working. While that workaround is ugly, the tests
currently are not meaningful, and it seems best to address them
separately. 
Author: Andres Freund Reviewed-By: Robert Haas Reviewed-By: Thomas Munro Reviewed-By: David Rowley Discussion: https://postgr.es/m/20200301083601.ews6hz5dduc3w2se@alap3.anarazel.de --- contrib/amcheck/verify_nbtree.c | 8 +- contrib/pg_visibility/pg_visibility.c | 18 +- contrib/pgstattuple/pgstatapprox.c | 2 +- src/backend/access/gin/ginvacuum.c | 26 + src/backend/access/gist/gistutil.c | 8 +- src/backend/access/gist/gistxlog.c | 10 +- src/backend/access/heap/heapam.c | 15 +- src/backend/access/heap/heapam_handler.c | 24 +- src/backend/access/heap/heapam_visibility.c | 99 ++- src/backend/access/heap/pruneheap.c | 205 ++++- src/backend/access/heap/vacuumlazy.c | 24 +- src/backend/access/index/indexam.c | 3 +- src/backend/access/nbtree/README | 10 +- src/backend/access/nbtree/nbtpage.c | 4 +- src/backend/access/nbtree/nbtree.c | 28 +- src/backend/access/nbtree/nbtxlog.c | 10 +- src/backend/access/spgist/spgvacuum.c | 6 +- src/backend/access/transam/README | 82 +- src/backend/access/transam/xlog.c | 4 +- src/backend/commands/analyze.c | 2 +- src/backend/commands/vacuum.c | 41 +- src/backend/postmaster/autovacuum.c | 4 + src/backend/replication/logical/launcher.c | 4 + src/backend/replication/walreceiver.c | 17 +- src/backend/replication/walsender.c | 15 +- src/backend/storage/ipc/procarray.c | 901 ++++++++++++++++---- src/backend/utils/adt/selfuncs.c | 20 +- src/backend/utils/init/postinit.c | 4 + src/backend/utils/time/snapmgr.c | 250 +++--- src/include/access/ginblock.h | 4 +- src/include/access/heapam.h | 10 +- src/include/access/transam.h | 79 +- src/include/storage/bufpage.h | 6 - src/include/storage/proc.h | 8 - src/include/storage/procarray.h | 32 +- src/include/utils/snapmgr.h | 37 +- src/include/utils/snapshot.h | 6 + src/tools/pgindent/typedefs.list | 2 + 38 files changed, 1462 insertions(+), 566 deletions(-) diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 635ece73b354..5f3de3c0b7f6 100644 --- 
a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -434,10 +434,10 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, RelationGetRelationName(rel)); /* - * RecentGlobalXmin assertion matches index_getnext_tid(). See note on - * RecentGlobalXmin/B-Tree page deletion. + * This assertion matches the one in index_getnext_tid(). See page + * recycling/"visible to everyone" notes in nbtree README. */ - Assert(TransactionIdIsValid(RecentGlobalXmin)); + Assert(TransactionIdIsValid(RecentXmin)); /* * Initialize state for entire verification operation @@ -1581,7 +1581,7 @@ bt_right_page_check_scankey(BtreeCheckState *state) * does not occur until no possible index scan could land on the page. * Index scans can follow links with nothing more than their snapshot as * an interlock and be sure of at least that much. (See page - * recycling/RecentGlobalXmin notes in nbtree README.) + * recycling/"visible to everyone" notes in nbtree README.) * * Furthermore, it's okay if we follow a rightlink and find a half-dead or * dead (ignorable) page one or more times. There will either be a diff --git a/contrib/pg_visibility/pg_visibility.c b/contrib/pg_visibility/pg_visibility.c index e731161734ae..54e47b810fd2 100644 --- a/contrib/pg_visibility/pg_visibility.c +++ b/contrib/pg_visibility/pg_visibility.c @@ -563,17 +563,14 @@ collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen) BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD); TransactionId OldestXmin = InvalidTransactionId; - if (all_visible) - { - /* Don't pass rel; that will fail in recovery. 
*/ - OldestXmin = GetOldestXmin(NULL, PROCARRAY_FLAGS_VACUUM); - } - rel = relation_open(relid, AccessShareLock); /* Only some relkinds have a visibility map */ check_relation_relkind(rel); + if (all_visible) + OldestXmin = GetOldestNonRemovableTransactionId(rel); + nblocks = RelationGetNumberOfBlocks(rel); /* @@ -679,11 +676,12 @@ collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen) * From a concurrency point of view, it sort of sucks to * retake ProcArrayLock here while we're holding the buffer * exclusively locked, but it should be safe against - * deadlocks, because surely GetOldestXmin() should never take - * a buffer lock. And this shouldn't happen often, so it's - * worth being careful so as to avoid false positives. + * deadlocks, because surely + * GetOldestNonRemovableTransactionId() should never take a + * buffer lock. And this shouldn't happen often, so it's worth + * being careful so as to avoid false positives. */ - RecomputedOldestXmin = GetOldestXmin(NULL, PROCARRAY_FLAGS_VACUUM); + RecomputedOldestXmin = GetOldestNonRemovableTransactionId(rel); if (!TransactionIdPrecedes(OldestXmin, RecomputedOldestXmin)) record_corrupt_item(items, &tuple.t_self); diff --git a/contrib/pgstattuple/pgstatapprox.c b/contrib/pgstattuple/pgstatapprox.c index dbc0fa11f615..3a99333d4435 100644 --- a/contrib/pgstattuple/pgstatapprox.c +++ b/contrib/pgstattuple/pgstatapprox.c @@ -71,7 +71,7 @@ statapprox_heap(Relation rel, output_type *stat) BufferAccessStrategy bstrategy; TransactionId OldestXmin; - OldestXmin = GetOldestXmin(rel, PROCARRAY_FLAGS_VACUUM); + OldestXmin = GetOldestNonRemovableTransactionId(rel); bstrategy = GetAccessStrategy(BAS_BULKREAD); nblocks = RelationGetNumberOfBlocks(rel); diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c index 8ae4fd95a7bf..9cd6638df621 100644 --- a/src/backend/access/gin/ginvacuum.c +++ b/src/backend/access/gin/ginvacuum.c @@ -793,3 +793,29 @@ ginvacuumcleanup(IndexVacuumInfo *info, 
IndexBulkDeleteResult *stats) return stats; } + +/* + * Return whether Page can safely be recycled. + */ +bool +GinPageIsRecyclable(Page page) +{ + TransactionId delete_xid; + + if (PageIsNew(page)) + return true; + + if (!GinPageIsDeleted(page)) + return false; + + delete_xid = GinPageGetDeleteXid(page); + + if (!TransactionIdIsValid(delete_xid)) + return true; + + /* + * If no backend still could view delete_xid as in running, all scans + * concurrent with ginDeletePage() must have finished. + */ + return GlobalVisCheckRemovableXid(NULL, delete_xid); +} diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index 765329bbcd43..bfda7fbe3d58 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -891,15 +891,13 @@ gistPageRecyclable(Page page) * As long as that can happen, we must keep the deleted page around as * a tombstone. * - * Compare the deletion XID with RecentGlobalXmin. If deleteXid < - * RecentGlobalXmin, then no scan that's still in progress could have + * For that check if the deletion XID could still be visible to + * anyone. If not, then no scan that's still in progress could have * seen its downlink, and we can recycle it. */ FullTransactionId deletexid_full = GistPageGetDeleteXid(page); - FullTransactionId recentxmin_full = GetFullRecentGlobalXmin(); - if (FullTransactionIdPrecedes(deletexid_full, recentxmin_full)) - return true; + return GlobalVisIsRemovableFullXid(NULL, deletexid_full); } return false; } diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index 7b5d1e98b70b..a63b05388c5d 100644 --- a/src/backend/access/gist/gistxlog.c +++ b/src/backend/access/gist/gistxlog.c @@ -387,11 +387,11 @@ gistRedoPageReuse(XLogReaderState *record) * PAGE_REUSE records exist to provide a conflict point when we reuse * pages in the index via the FSM. That's all they do though. * - * latestRemovedXid was the page's deleteXid. 
The deleteXid < - * RecentGlobalXmin test in gistPageRecyclable() conceptually mirrors the - * pgxact->xmin > limitXmin test in GetConflictingVirtualXIDs(). - * Consequently, one XID value achieves the same exclusion effect on - * primary and standby. + * latestRemovedXid was the page's deleteXid. The + * GlobalVisIsRemovableFullXid(deleteXid) test in gistPageRecyclable() + * conceptually mirrors the pgxact->xmin > limitXmin test in + * GetConflictingVirtualXIDs(). Consequently, one XID value achieves the + * same exclusion effect on primary and standby. */ if (InHotStandby) { diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 00169006fb1f..f75e1cf0e7b0 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -1517,6 +1517,7 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, bool at_chain_start; bool valid; bool skip; + GlobalVisState *vistest = NULL; /* If this is not the first call, previous call returned a (live!) tuple */ if (all_dead) @@ -1527,7 +1528,8 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, at_chain_start = first_call; skip = !first_call; - Assert(TransactionIdIsValid(RecentGlobalXmin)); + /* XXX: we should assert that a snapshot is pushed or registered */ + Assert(TransactionIdIsValid(RecentXmin)); Assert(BufferGetBlockNumber(buffer) == blkno); /* Scan through possible multiple members of HOT-chain */ @@ -1616,9 +1618,14 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, * Note: if you change the criterion here for what is "dead", fix the * planner's get_actual_variable_range() function to match. 
*/ - if (all_dead && *all_dead && - !HeapTupleIsSurelyDead(heapTuple, RecentGlobalXmin)) - *all_dead = false; + if (all_dead && *all_dead) + { + if (!vistest) + vistest = GlobalVisTestFor(relation); + + if (!HeapTupleIsSurelyDead(heapTuple, vistest)) + *all_dead = false; + } /* * Check to see if HOT chain continues past this tuple; if so fetch diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 267a6ee25a75..e3e41fb75163 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -1203,7 +1203,7 @@ heapam_index_build_range_scan(Relation heapRelation, /* okay to ignore lazy VACUUMs here */ if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent) - OldestXmin = GetOldestXmin(heapRelation, PROCARRAY_FLAGS_VACUUM); + OldestXmin = GetOldestNonRemovableTransactionId(heapRelation); if (!scan) { @@ -1244,6 +1244,17 @@ heapam_index_build_range_scan(Relation heapRelation, hscan = (HeapScanDesc) scan; + /* + * Must have called GetOldestNonRemovableTransactionId() if using + * SnapshotAny. Shouldn't have for an MVCC snapshot. (It's especially + * worth checking this for parallel builds, since ambuild routines that + * support parallel builds must work these details out for themselves.) + */ + Assert(snapshot == SnapshotAny || IsMVCCSnapshot(snapshot)); + Assert(snapshot == SnapshotAny ? TransactionIdIsValid(OldestXmin) : + !TransactionIdIsValid(OldestXmin)); + Assert(snapshot == SnapshotAny || !anyvisible); + /* Publish number of blocks to scan */ if (progress) { @@ -1263,17 +1274,6 @@ heapam_index_build_range_scan(Relation heapRelation, nblocks); } - /* - * Must call GetOldestXmin() with SnapshotAny. Should never call - * GetOldestXmin() with MVCC snapshot. (It's especially worth checking - * this for parallel builds, since ambuild routines that support parallel - * builds must work these details out for themselves.) 
- */ - Assert(snapshot == SnapshotAny || IsMVCCSnapshot(snapshot)); - Assert(snapshot == SnapshotAny ? TransactionIdIsValid(OldestXmin) : - !TransactionIdIsValid(OldestXmin)); - Assert(snapshot == SnapshotAny || !anyvisible); - /* set our scan endpoints */ if (!allow_sync) heap_setscanlimits(scan, start_blockno, numblocks); diff --git a/src/backend/access/heap/heapam_visibility.c b/src/backend/access/heap/heapam_visibility.c index c77128087cf7..528e75bafd45 100644 --- a/src/backend/access/heap/heapam_visibility.c +++ b/src/backend/access/heap/heapam_visibility.c @@ -1154,19 +1154,56 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, * we mainly want to know is if a tuple is potentially visible to *any* * running transaction. If so, it can't be removed yet by VACUUM. * - * OldestXmin is a cutoff XID (obtained from GetOldestXmin()). Tuples - * deleted by XIDs >= OldestXmin are deemed "recently dead"; they might - * still be visible to some open transaction, so we can't remove them, - * even if we see that the deleting transaction has committed. + * OldestXmin is a cutoff XID (obtained from + * GetOldestNonRemovableTransactionId()). Tuples deleted by XIDs >= + * OldestXmin are deemed "recently dead"; they might still be visible to some + * open transaction, so we can't remove them, even if we see that the deleting + * transaction has committed. */ HTSV_Result HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, Buffer buffer) +{ + TransactionId dead_after = InvalidTransactionId; + HTSV_Result res; + + res = HeapTupleSatisfiesVacuumHorizon(htup, buffer, &dead_after); + + if (res == HEAPTUPLE_RECENTLY_DEAD) + { + Assert(TransactionIdIsValid(dead_after)); + + if (TransactionIdPrecedes(dead_after, OldestXmin)) + res = HEAPTUPLE_DEAD; + } + else + Assert(!TransactionIdIsValid(dead_after)); + + return res; +} + +/* + * Work horse for HeapTupleSatisfiesVacuum and similar routines. 
+ * + * In contrast to HeapTupleSatisfiesVacuum this routine, when encountering a + * tuple that could still be visible to some backend, stores the xid that + * needs to be compared with the horizon in *dead_after, and returns + * HEAPTUPLE_RECENTLY_DEAD. The caller then can perform the comparison with + * the horizon. This is e.g. useful when comparing with different horizons. + * + * Note: HEAPTUPLE_DEAD can still be returned here, e.g. if the inserting + * transaction aborted. + */ +HTSV_Result +HeapTupleSatisfiesVacuumHorizon(HeapTuple htup, Buffer buffer, TransactionId *dead_after) { HeapTupleHeader tuple = htup->t_data; Assert(ItemPointerIsValid(&htup->t_self)); Assert(htup->t_tableOid != InvalidOid); + Assert(dead_after != NULL); + + *dead_after = InvalidTransactionId; /* * Has inserting transaction committed? @@ -1323,17 +1360,15 @@ HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, else if (TransactionIdDidCommit(xmax)) { /* - * The multixact might still be running due to lockers. If the - * updater is below the xid horizon, we have to return DEAD - * regardless -- otherwise we could end up with a tuple where the - * updater has to be removed due to the horizon, but is not pruned - * away. It's not a problem to prune that tuple, because any - * remaining lockers will also be present in newer tuple versions. + * The multixact might still be running due to lockers. Need to + * allow for pruning if below the xid horizon regardless -- + * otherwise we could end up with a tuple where the updater has to + * be removed due to the horizon, but is not pruned away. It's + * not a problem to prune that tuple, because any remaining + * lockers will also be present in newer tuple versions. 
*/ - if (!TransactionIdPrecedes(xmax, OldestXmin)) - return HEAPTUPLE_RECENTLY_DEAD; - - return HEAPTUPLE_DEAD; + *dead_after = xmax; + return HEAPTUPLE_RECENTLY_DEAD; } else if (!MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), false)) { @@ -1372,14 +1407,11 @@ HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, } /* - * Deleter committed, but perhaps it was recent enough that some open - * transactions could still see the tuple. + * Deleter committed, allow caller to check if it was recent enough that + * some open transactions could still see the tuple. */ - if (!TransactionIdPrecedes(HeapTupleHeaderGetRawXmax(tuple), OldestXmin)) - return HEAPTUPLE_RECENTLY_DEAD; - - /* Otherwise, it's dead and removable */ - return HEAPTUPLE_DEAD; + *dead_after = HeapTupleHeaderGetRawXmax(tuple); + return HEAPTUPLE_RECENTLY_DEAD; } @@ -1393,14 +1425,28 @@ HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, * * This is an interface to HeapTupleSatisfiesVacuum that's callable via * HeapTupleSatisfiesSnapshot, so it can be used through a Snapshot. - * snapshot->xmin must have been set up with the xmin horizon to use. + * snapshot->vistest must have been set up with the horizon to use. */ static bool HeapTupleSatisfiesNonVacuumable(HeapTuple htup, Snapshot snapshot, Buffer buffer) { - return HeapTupleSatisfiesVacuum(htup, snapshot->xmin, buffer) - != HEAPTUPLE_DEAD; + TransactionId dead_after = InvalidTransactionId; + HTSV_Result res; + + res = HeapTupleSatisfiesVacuumHorizon(htup, buffer, &dead_after); + + if (res == HEAPTUPLE_RECENTLY_DEAD) + { + Assert(TransactionIdIsValid(dead_after)); + + if (GlobalVisTestIsRemovableXid(snapshot->vistest, dead_after)) + res = HEAPTUPLE_DEAD; + } + else + Assert(!TransactionIdIsValid(dead_after)); + + return res != HEAPTUPLE_DEAD; } @@ -1418,7 +1464,7 @@ HeapTupleSatisfiesNonVacuumable(HeapTuple htup, Snapshot snapshot, * if the tuple is removable. 
*/ bool -HeapTupleIsSurelyDead(HeapTuple htup, TransactionId OldestXmin) +HeapTupleIsSurelyDead(HeapTuple htup, GlobalVisState *vistest) { HeapTupleHeader tuple = htup->t_data; @@ -1459,7 +1505,8 @@ HeapTupleIsSurelyDead(HeapTuple htup, TransactionId OldestXmin) return false; /* Deleter committed, so tuple is dead if the XID is old enough. */ - return TransactionIdPrecedes(HeapTupleHeaderGetRawXmax(tuple), OldestXmin); + return GlobalVisTestIsRemovableXid(vistest, + HeapTupleHeaderGetRawXmax(tuple)); } /* diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 256df4de1050..00a3cb106aac 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -23,12 +23,30 @@ #include "miscadmin.h" #include "pgstat.h" #include "storage/bufmgr.h" +#include "utils/snapmgr.h" #include "utils/rel.h" #include "utils/snapmgr.h" /* Working data for heap_page_prune and subroutines */ typedef struct { + Relation rel; + + /* tuple visibility test, initialized for the relation */ + GlobalVisState *vistest; + + /* + * Thresholds set by TransactionIdLimitedForOldSnapshots() if they have + * been computed (done on demand, and only if + * OldSnapshotThresholdActive()). The first time a tuple is about to be + * removed based on the limited horizon, old_snap_used is set to true, and + * SetOldSnapshotThresholdTimestamp() is called. See + * heap_prune_satisfies_vacuum(). 
+ */ + TimestampTz old_snap_ts; + TransactionId old_snap_xmin; + bool old_snap_used; + TransactionId new_prune_xid; /* new prune hint value for page */ TransactionId latestRemovedXid; /* latest xid to be removed by this prune */ int nredirected; /* numbers of entries in arrays below */ @@ -43,9 +61,8 @@ typedef struct } PruneState; /* Local functions */ -static int heap_prune_chain(Relation relation, Buffer buffer, +static int heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, - TransactionId OldestXmin, PruneState *prstate); static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid); static void heap_prune_record_redirect(PruneState *prstate, @@ -65,16 +82,16 @@ static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum); * if there's not any use in pruning. * * Caller must have pin on the buffer, and must *not* have a lock on it. - * - * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD - * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum). */ void heap_page_prune_opt(Relation relation, Buffer buffer) { Page page = BufferGetPage(buffer); + TransactionId prune_xid; + GlobalVisState *vistest; + TransactionId limited_xmin = InvalidTransactionId; + TimestampTz limited_ts = 0; Size minfree; - TransactionId OldestXmin; /* * We can't write WAL in recovery mode, so there's no point trying to @@ -85,37 +102,55 @@ heap_page_prune_opt(Relation relation, Buffer buffer) return; /* - * Use the appropriate xmin horizon for this relation. If it's a proper - * catalog relation or a user defined, additional, catalog relation, we - * need to use the horizon that includes slots, otherwise the data-only - * horizon can be used. Note that the toast relation of user defined - * relations are *not* considered catalog relations. + * XXX: Magic to keep old_snapshot_threshold tests appear "working". They + * currently are broken, and discussion of what to do about them is + * ongoing. 
See + * https://www.postgresql.org/message-id/20200403001235.e6jfdll3gh2ygbuc%40alap3.anarazel.de + */ + if (old_snapshot_threshold == 0) + SnapshotTooOldMagicForTest(); + + /* + * First check whether there's any chance there's something to prune, + * determining the appropriate horizon is a waste if there's no prune_xid + * (i.e. no updates/deletes left potentially dead tuples around). + */ + prune_xid = ((PageHeader) page)->pd_prune_xid; + if (!TransactionIdIsValid(prune_xid)) + return; + + /* + * Check whether prune_xid indicates that there may be dead rows that can + * be cleaned up. * - * It is OK to apply the old snapshot limit before acquiring the cleanup + * It is OK to check the old snapshot limit before acquiring the cleanup * lock because the worst that can happen is that we are not quite as * aggressive about the cleanup (by however many transaction IDs are * consumed between this point and acquiring the lock). This allows us to * save significant overhead in the case where the page is found not to be * prunable. + * + * Even if old_snapshot_threshold is set, we first check whether the page + * can be pruned without. Both because + * TransactionIdLimitedForOldSnapshots() is not cheap, and because not + * unnecessarily relying on old_snapshot_threshold avoids causing + * conflicts. */ - if (IsCatalogRelation(relation) || - RelationIsAccessibleInLogicalDecoding(relation)) - OldestXmin = RecentGlobalXmin; - else - OldestXmin = - TransactionIdLimitedForOldSnapshots(RecentGlobalDataXmin, - relation); + vistest = GlobalVisTestFor(relation); - Assert(TransactionIdIsValid(OldestXmin)); + if (!GlobalVisTestIsRemovableXid(vistest, prune_xid)) + { + if (!OldSnapshotThresholdActive()) + return; - /* - * Let's see if we really need pruning. - * - * Forget it if page is not hinted to contain something prunable that's - * older than OldestXmin. 
- */ - if (!PageIsPrunable(page, OldestXmin)) - return; + if (!TransactionIdLimitedForOldSnapshots(GlobalVisTestNonRemovableHorizon(vistest), + relation, + &limited_xmin, &limited_ts)) + return; + + if (!TransactionIdPrecedes(prune_xid, limited_xmin)) + return; + } /* * We prune when a previous UPDATE failed to find enough space on the page @@ -151,7 +186,9 @@ heap_page_prune_opt(Relation relation, Buffer buffer) * needed */ /* OK to prune */ - (void) heap_page_prune(relation, buffer, OldestXmin, true, &ignore); + (void) heap_page_prune(relation, buffer, vistest, + limited_xmin, limited_ts, + true, &ignore); } /* And release buffer lock */ @@ -165,8 +202,11 @@ heap_page_prune_opt(Relation relation, Buffer buffer) * * Caller must have pin and buffer cleanup lock on the page. * - * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD - * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum). + * vistest is used to distinguish whether tuples are DEAD or RECENTLY_DEAD + * (see heap_prune_satisfies_vacuum and + * HeapTupleSatisfiesVacuum). old_snap_xmin / old_snap_ts need to + * either have been set by TransactionIdLimitedForOldSnapshots, or + * InvalidTransactionId/0 respectively. * * If report_stats is true then we send the number of reclaimed heap-only * tuples to pgstats. (This must be false during vacuum, since vacuum will @@ -177,7 +217,10 @@ heap_page_prune_opt(Relation relation, Buffer buffer) * latestRemovedXid. */ int -heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, +heap_page_prune(Relation relation, Buffer buffer, + GlobalVisState *vistest, + TransactionId old_snap_xmin, + TimestampTz old_snap_ts, bool report_stats, TransactionId *latestRemovedXid) { int ndeleted = 0; @@ -198,6 +241,11 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, * initialize the rest of our working state. 
 	 */
 	prstate.new_prune_xid = InvalidTransactionId;
+	prstate.rel = relation;
+	prstate.vistest = vistest;
+	prstate.old_snap_xmin = old_snap_xmin;
+	prstate.old_snap_ts = old_snap_ts;
+	prstate.old_snap_used = false;
 	prstate.latestRemovedXid = *latestRemovedXid;
 	prstate.nredirected = prstate.ndead = prstate.nunused = 0;
 	memset(prstate.marked, 0, sizeof(prstate.marked));
@@ -220,9 +268,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
 			continue;
 
 		/* Process this item or chain of items */
-		ndeleted += heap_prune_chain(relation, buffer, offnum,
-									 OldestXmin,
-									 &prstate);
+		ndeleted += heap_prune_chain(buffer, offnum, &prstate);
 	}
 
 	/* Any error while applying the changes is critical */
@@ -323,6 +369,85 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
 }
 
 
+/*
+ * Perform visibility checks for heap pruning.
+ *
+ * This is more complicated than just using GlobalVisTestIsRemovableXid()
+ * because of old_snapshot_threshold. We only want to increase the threshold
+ * that triggers errors for old snapshots when we actually decide to remove a
+ * row based on the limited horizon.
+ *
+ * Due to its cost we also only want to call
+ * TransactionIdLimitedForOldSnapshots() if necessary, i.e. we might not have
+ * done so in heap_page_prune_opt() if pd_prune_xid was old enough. But we
+ * still want to be able to remove rows that are too new to be removed
+ * according to prstate->vistest, but that can be removed based on
+ * old_snapshot_threshold. So we call TransactionIdLimitedForOldSnapshots() on
+ * demand in here, if appropriate.
+ */
+static HTSV_Result
+heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer)
+{
+	HTSV_Result res;
+	TransactionId dead_after;
+
+	res = HeapTupleSatisfiesVacuumHorizon(tup, buffer, &dead_after);
+
+	if (res != HEAPTUPLE_RECENTLY_DEAD)
+		return res;
+
+	/*
+	 * If we are already relying on the limited xmin, there is no need to
+	 * delay doing so anymore.
+	 */
+	if (prstate->old_snap_used)
+	{
+		Assert(TransactionIdIsValid(prstate->old_snap_xmin));
+
+		if (TransactionIdPrecedes(dead_after, prstate->old_snap_xmin))
+			res = HEAPTUPLE_DEAD;
+		return res;
+	}
+
+	/*
+	 * First check if GlobalVisTestIsRemovableXid() is sufficient to find the
+	 * row dead. If not, and old_snapshot_threshold is enabled, try to use the
+	 * lowered horizon.
+	 */
+	if (GlobalVisTestIsRemovableXid(prstate->vistest, dead_after))
+		res = HEAPTUPLE_DEAD;
+	else if (OldSnapshotThresholdActive())
+	{
+		/* haven't determined the limited horizon yet, request it */
+		if (!TransactionIdIsValid(prstate->old_snap_xmin))
+		{
+			TransactionId horizon =
+			GlobalVisTestNonRemovableHorizon(prstate->vistest);
+
+			TransactionIdLimitedForOldSnapshots(horizon, prstate->rel,
+												&prstate->old_snap_xmin,
+												&prstate->old_snap_ts);
+		}
+
+		if (TransactionIdIsValid(prstate->old_snap_xmin) &&
+			TransactionIdPrecedes(dead_after, prstate->old_snap_xmin))
+		{
+			/*
+			 * About to remove row based on snapshot_too_old. Need to raise
+			 * the threshold so problematic accesses would error.
+			 */
+			Assert(!prstate->old_snap_used);
+			SetOldSnapshotThresholdTimestamp(prstate->old_snap_ts,
+											 prstate->old_snap_xmin);
+			prstate->old_snap_used = true;
+			res = HEAPTUPLE_DEAD;
+		}
+	}
+
+	return res;
+}
+
+
 /*
  * Prune specified line pointer or a HOT chain originating at line pointer.
  *
@@ -349,9 +474,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
  * Returns the number of tuples (to be) deleted from the page.
*/ static int -heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum, - TransactionId OldestXmin, - PruneState *prstate) +heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) { int ndeleted = 0; Page dp = (Page) BufferGetPage(buffer); @@ -366,7 +489,7 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum, i; HeapTupleData tup; - tup.t_tableOid = RelationGetRelid(relation); + tup.t_tableOid = RelationGetRelid(prstate->rel); rootlp = PageGetItemId(dp, rootoffnum); @@ -401,7 +524,7 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum, * either here or while following a chain below. Whichever path * gets there first will mark the tuple unused. */ - if (HeapTupleSatisfiesVacuum(&tup, OldestXmin, buffer) + if (heap_prune_satisfies_vacuum(prstate, &tup, buffer) == HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup)) { heap_prune_record_unused(prstate, rootoffnum); @@ -485,7 +608,7 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum, */ tupdead = recent_dead = false; - switch (HeapTupleSatisfiesVacuum(&tup, OldestXmin, buffer)) + switch (heap_prune_satisfies_vacuum(prstate, &tup, buffer)) { case HEAPTUPLE_DEAD: tupdead = true; diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 1bbc4598f75e..44e2224dd557 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -788,6 +788,7 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats, PROGRESS_VACUUM_MAX_DEAD_TUPLES }; int64 initprog_val[3]; + GlobalVisState *vistest; pg_rusage_init(&ru0); @@ -816,6 +817,8 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats, vacrelstats->nonempty_pages = 0; vacrelstats->latestRemovedXid = InvalidTransactionId; + vistest = GlobalVisTestFor(onerel); + /* * Initialize state for a parallel vacuum. 
As of now, only one worker can * be used for an index, so we invoke parallelism only if there are at @@ -1239,7 +1242,8 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats, * * We count tuples removed by the pruning step as removed by VACUUM. */ - tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false, + tups_vacuumed += heap_page_prune(onerel, buf, vistest, false, + InvalidTransactionId, 0, &vacrelstats->latestRemovedXid); /* @@ -1596,14 +1600,16 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats, } /* - * It's possible for the value returned by GetOldestXmin() to move - * backwards, so it's not wrong for us to see tuples that appear to - * not be visible to everyone yet, while PD_ALL_VISIBLE is already - * set. The real safe xmin value never moves backwards, but - * GetOldestXmin() is conservative and sometimes returns a value - * that's unnecessarily small, so if we see that contradiction it just - * means that the tuples that we think are not visible to everyone yet - * actually are, and the PD_ALL_VISIBLE flag is correct. + * It's possible for the value returned by + * GetOldestNonRemovableTransactionId() to move backwards, so it's not + * wrong for us to see tuples that appear to not be visible to + * everyone yet, while PD_ALL_VISIBLE is already set. The real safe + * xmin value never moves backwards, but + * GetOldestNonRemovableTransactionId() is conservative and sometimes + * returns a value that's unnecessarily small, so if we see that + * contradiction it just means that the tuples that we think are not + * visible to everyone yet actually are, and the PD_ALL_VISIBLE flag + * is correct. * * There should never be dead tuples on a page with PD_ALL_VISIBLE * set, however. 
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 6b9750c244a7..3fb8688f8f4c 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -519,7 +519,8 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) SCAN_CHECKS; CHECK_SCAN_PROCEDURE(amgettuple); - Assert(TransactionIdIsValid(RecentGlobalXmin)); + /* XXX: we should assert that a snapshot is pushed or registered */ + Assert(TransactionIdIsValid(RecentXmin)); /* * The AM's amgettuple proc finds the next index entry matching the scan diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index abce31a5a96b..781a8f1932d3 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -342,9 +342,9 @@ snapshots and registered snapshots as of the deletion are gone; which is overly strong, but is simple to implement within Postgres. When marked dead, a deleted page is labeled with the next-transaction counter value. VACUUM can reclaim the page for re-use when this transaction number is -older than RecentGlobalXmin. As collateral damage, this implementation -also waits for running XIDs with no snapshots and for snapshots taken -until the next transaction to allocate an XID commits. +guaranteed to be "visible to everyone". As collateral damage, this +implementation also waits for running XIDs with no snapshots and for +snapshots taken until the next transaction to allocate an XID commits. Reclaiming a page doesn't actually change its state on disk --- we simply record it in the shared-memory free space map, from which it will be @@ -411,8 +411,8 @@ page and also the correct place to hold the current value. We can avoid the cost of walking down the tree in such common cases. The optimization works on the assumption that there can only be one -non-ignorable leaf rightmost page, and so even a RecentGlobalXmin style -interlock isn't required. 
We cannot fail to detect that our hint was +non-ignorable leaf rightmost page, and so not even a visible-to-everyone +style interlock required. We cannot fail to detect that our hint was invalidated, because there can only be one such page in the B-Tree at any time. It's possible that the page will be deleted and recycled without a backend's cached page also being detected as invalidated, but diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index d5db9aaa3a13..74be3807bb7d 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -1097,7 +1097,7 @@ _bt_page_recyclable(Page page) */ opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (P_ISDELETED(opaque) && - TransactionIdPrecedes(opaque->btpo.xact, RecentGlobalXmin)) + GlobalVisCheckRemovableXid(NULL, opaque->btpo.xact)) return true; return false; } @@ -2318,7 +2318,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, * updated links to the target, ReadNewTransactionId() suffices as an * upper bound. Any scan having retained a now-stale link is advertising * in its PGXACT an xmin less than or equal to the value we read here. It - * will continue to do so, holding back RecentGlobalXmin, for the duration + * will continue to do so, holding back the xmin horizon, for the duration * of that scan. */ page = BufferGetPage(buf); diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 49a8a9708e38..8fa6ac7296b9 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -808,6 +808,12 @@ _bt_vacuum_needs_cleanup(IndexVacuumInfo *info) metapg = BufferGetPage(metabuf); metad = BTPageGetMeta(metapg); + /* + * XXX: If IndexVacuumInfo contained the heap relation, we could be more + * aggressive about vacuuming non catalog relations by passing the table + * to GlobalVisCheckRemovableXid(). 
+ */ + if (metad->btm_version < BTREE_NOVAC_VERSION) { /* @@ -817,13 +823,12 @@ _bt_vacuum_needs_cleanup(IndexVacuumInfo *info) result = true; } else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) && - TransactionIdPrecedes(metad->btm_oldest_btpo_xact, - RecentGlobalXmin)) + GlobalVisCheckRemovableXid(NULL, metad->btm_oldest_btpo_xact)) { /* * If any oldest btpo.xact from a previously deleted page in the index - * is older than RecentGlobalXmin, then at least one deleted page can - * be recycled -- don't skip cleanup. + * is visible to everyone, then at least one deleted page can be + * recycled -- don't skip cleanup. */ result = true; } @@ -1276,14 +1281,13 @@ btvacuumpage(BTVacState *vstate, BlockNumber scanblkno) * own conflict now.) * * Backends with snapshots acquired after a VACUUM starts but - * before it finishes could have a RecentGlobalXmin with a - * later xid than the VACUUM's OldestXmin cutoff. These - * backends might happen to opportunistically mark some index - * tuples LP_DEAD before we reach them, even though they may - * be after our cutoff. We don't try to kill these "extra" - * index tuples in _bt_delitems_vacuum(). This keep things - * simple, and allows us to always avoid generating our own - * conflicts. + * before it finishes could have visibility cutoff with a + * later xid than VACUUM's OldestXmin cutoff. These backends + * might happen to opportunistically mark some index tuples + * LP_DEAD before we reach them, even though they may be after + * our cutoff. We don't try to kill these "extra" index + * tuples in _bt_delitems_vacuum(). This keep things simple, + * and allows us to always avoid generating our own conflicts. 
*/ Assert(!BTreeTupleIsPivot(itup)); if (!BTreeTupleIsPosting(itup)) diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index dbec58d5249c..bda9be234896 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -948,11 +948,11 @@ btree_xlog_reuse_page(XLogReaderState *record) * Btree reuse_page records exist to provide a conflict point when we * reuse pages in the index via the FSM. That's all they do though. * - * latestRemovedXid was the page's btpo.xact. The btpo.xact < - * RecentGlobalXmin test in _bt_page_recyclable() conceptually mirrors the - * pgxact->xmin > limitXmin test in GetConflictingVirtualXIDs(). - * Consequently, one XID value achieves the same exclusion effect on - * primary and standby. + * latestRemovedXid was the page's btpo.xact. The + * GlobalVisCheckRemovableXid test in _bt_page_recyclable() conceptually + * mirrors the pgxact->xmin > limitXmin test in + * GetConflictingVirtualXIDs(). Consequently, one XID value achieves the + * same exclusion effect on primary and standby. 
*/ if (InHotStandby) { diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index bd98707f3c05..e1c58933f979 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -501,10 +501,14 @@ vacuumRedirectAndPlaceholder(Relation index, Buffer buffer) OffsetNumber itemToPlaceholder[MaxIndexTuplesPerPage]; OffsetNumber itemnos[MaxIndexTuplesPerPage]; spgxlogVacuumRedirect xlrec; + GlobalVisState *vistest; xlrec.nToPlaceholder = 0; xlrec.newestRedirectXid = InvalidTransactionId; + /* XXX: providing heap relation would allow more pruning */ + vistest = GlobalVisTestFor(NULL); + START_CRIT_SECTION(); /* @@ -521,7 +525,7 @@ vacuumRedirectAndPlaceholder(Relation index, Buffer buffer) dt = (SpGistDeadTuple) PageGetItem(page, PageGetItemId(page, i)); if (dt->tupstate == SPGIST_REDIRECT && - TransactionIdPrecedes(dt->xid, RecentGlobalXmin)) + GlobalVisTestIsRemovableXid(vistest, dt->xid)) { dt->tupstate = SPGIST_PLACEHOLDER; Assert(opaque->nRedirection > 0); diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README index eb9aac5fd396..6f44ae9ce6a5 100644 --- a/src/backend/access/transam/README +++ b/src/backend/access/transam/README @@ -281,7 +281,7 @@ present or the overflow flag is set.) If a backend released XidGenLock before storing its XID into MyPgXact, then it would be possible for another backend to allocate and commit a later XID, causing latestCompletedXid to pass the first backend's XID, before that value became visible in the -ProcArray. That would break GetOldestXmin, as discussed below. +ProcArray. That would break ComputeXidHorizons, as discussed below. We allow GetNewTransactionId to store the XID into MyPgXact->xid (or the subxid array) without taking ProcArrayLock. This was once necessary to @@ -293,42 +293,50 @@ once, rather than assume they can read it multiple times and get the same answer each time. 
(Use volatile-qualified pointers when doing this, to ensure that the C compiler does exactly what you tell it to.) -Another important activity that uses the shared ProcArray is GetOldestXmin, -which must determine a lower bound for the oldest xmin of any active MVCC -snapshot, system-wide. Each individual backend advertises the smallest -xmin of its own snapshots in MyPgXact->xmin, or zero if it currently has no -live snapshots (eg, if it's between transactions or hasn't yet set a -snapshot for a new transaction). GetOldestXmin takes the MIN() of the -valid xmin fields. It does this with only shared lock on ProcArrayLock, -which means there is a potential race condition against other backends -doing GetSnapshotData concurrently: we must be certain that a concurrent -backend that is about to set its xmin does not compute an xmin less than -what GetOldestXmin returns. We ensure that by including all the active -XIDs into the MIN() calculation, along with the valid xmins. The rule that -transactions can't exit without taking exclusive ProcArrayLock ensures that -concurrent holders of shared ProcArrayLock will compute the same minimum of -currently-active XIDs: no xact, in particular not the oldest, can exit -while we hold shared ProcArrayLock. So GetOldestXmin's view of the minimum -active XID will be the same as that of any concurrent GetSnapshotData, and -so it can't produce an overestimate. If there is no active transaction at -all, GetOldestXmin returns latestCompletedXid + 1, which is a lower bound -for the xmin that might be computed by concurrent or later GetSnapshotData -calls. (We know that no XID less than this could be about to appear in -the ProcArray, because of the XidGenLock interlock discussed above.) - -GetSnapshotData also performs an oldest-xmin calculation (which had better -match GetOldestXmin's) and stores that into RecentGlobalXmin, which is used -for some tuple age cutoff checks where a fresh call of GetOldestXmin seems -too expensive. 
Note that while it is certain that two concurrent -executions of GetSnapshotData will compute the same xmin for their own -snapshots, as argued above, it is not certain that they will arrive at the -same estimate of RecentGlobalXmin. This is because we allow XID-less -transactions to clear their MyPgXact->xmin asynchronously (without taking -ProcArrayLock), so one execution might see what had been the oldest xmin, -and another not. This is OK since RecentGlobalXmin need only be a valid -lower bound. As noted above, we are already assuming that fetch/store -of the xid fields is atomic, so assuming it for xmin as well is no extra -risk. +Another important activity that uses the shared ProcArray is +ComputeXidHorizons, which must determine a lower bound for the oldest xmin +of any active MVCC snapshot, system-wide. Each individual backend +advertises the smallest xmin of its own snapshots in MyPgXact->xmin, or zero +if it currently has no live snapshots (eg, if it's between transactions or +hasn't yet set a snapshot for a new transaction). ComputeXidHorizons takes +the MIN() of the valid xmin fields. It does this with only shared lock on +ProcArrayLock, which means there is a potential race condition against other +backends doing GetSnapshotData concurrently: we must be certain that a +concurrent backend that is about to set its xmin does not compute an xmin +less than what ComputeXidHorizons determines. We ensure that by including +all the active XIDs into the MIN() calculation, along with the valid xmins. +The rule that transactions can't exit without taking exclusive ProcArrayLock +ensures that concurrent holders of shared ProcArrayLock will compute the +same minimum of currently-active XIDs: no xact, in particular not the +oldest, can exit while we hold shared ProcArrayLock. So +ComputeXidHorizons's view of the minimum active XID will be the same as that +of any concurrent GetSnapshotData, and so it can't produce an overestimate. 
+If there is no active transaction at all, ComputeXidHorizons uses +latestCompletedXid + 1, which is a lower bound for the xmin that might +be computed by concurrent or later GetSnapshotData calls. (We know that no +XID less than this could be about to appear in the ProcArray, because of the +XidGenLock interlock discussed above.) + +As GetSnapshotData is performance critical, it does not perform an accurate +oldest-xmin calculation (it used to, until v13). The contents of a snapshot +only depend on the xids of other backends, not their xmin. As backend's xmin +changes much more often than its xid, having GetSnapshotData look at xmins +can lead to a lot of unnecessary cacheline ping-pong. Instead +GetSnapshotData updates approximate thresholds (one that guarantees that all +deleted rows older than it can be removed, another determining that deleted +rows newer than it can not be removed). GlobalVisTest* uses those threshold +to make invisibility decision, falling back to ComputeXidHorizons if +necessary. + +Note that while it is certain that two concurrent executions of +GetSnapshotData will compute the same xmin for their own snapshots, there is +no such guarantee for the horizons computed by ComputeXidHorizons. This is +because we allow XID-less transactions to clear their MyPgXact->xmin +asynchronously (without taking ProcArrayLock), so one execution might see +what had been the oldest xmin, and another not. This is OK since the +thresholds need only be a valid lower bound. As noted above, we are already +assuming that fetch/store of the xid fields is atomic, so assuming it for +xmin as well is no extra risk. pg_xact and pg_subtrans diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 8f72faee82cc..09c01ed4ae48 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -9096,7 +9096,7 @@ CreateCheckPoint(int flags) * StartupSUBTRANS hasn't been called yet. 
*/ if (!RecoveryInProgress()) - TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT)); + TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning()); /* Real work is done, but log and update stats before releasing lock. */ LogCheckpointEnd(false); @@ -9456,7 +9456,7 @@ CreateRestartPoint(int flags) * this because StartupSUBTRANS hasn't been called yet. */ if (EnableHotStandby) - TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT)); + TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning()); /* Real work is done, but log and update before releasing lock. */ LogCheckpointEnd(true); diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c index e0fa73ba7909..8af12b5c6b2b 100644 --- a/src/backend/commands/analyze.c +++ b/src/backend/commands/analyze.c @@ -1045,7 +1045,7 @@ acquire_sample_rows(Relation onerel, int elevel, totalblocks = RelationGetNumberOfBlocks(onerel); /* Need a cutoff xmin for HeapTupleSatisfiesVacuum */ - OldestXmin = GetOldestXmin(onerel, PROCARRAY_FLAGS_VACUUM); + OldestXmin = GetOldestNonRemovableTransactionId(onerel); /* Prepare for sampling block numbers */ nblocks = BlockSampler_Init(&bs, totalblocks, targrows, random()); diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 576c7e63e99a..22228f5684f0 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -955,8 +955,25 @@ vacuum_set_xid_limits(Relation rel, * working on a particular table at any time, and that each vacuum is * always an independent transaction. 
*/ - *oldestXmin = - TransactionIdLimitedForOldSnapshots(GetOldestXmin(rel, PROCARRAY_FLAGS_VACUUM), rel); + *oldestXmin = GetOldestNonRemovableTransactionId(rel); + + if (OldSnapshotThresholdActive()) + { + TransactionId limit_xmin; + TimestampTz limit_ts; + + if (TransactionIdLimitedForOldSnapshots(*oldestXmin, rel, &limit_xmin, &limit_ts)) + { + /* + * TODO: We should only set the threshold if we are pruning on the + * basis of the increased limits. Not as crucial here as it is for + * opportunistic pruning (which often happens at a much higher + * frequency), but would still be a significant improvement. + */ + SetOldSnapshotThresholdTimestamp(limit_ts, limit_xmin); + *oldestXmin = limit_xmin; + } + } Assert(TransactionIdIsNormal(*oldestXmin)); @@ -1345,12 +1362,13 @@ vac_update_datfrozenxid(void) bool dirty = false; /* - * Initialize the "min" calculation with GetOldestXmin, which is a - * reasonable approximation to the minimum relfrozenxid for not-yet- - * committed pg_class entries for new tables; see AddNewRelationTuple(). - * So we cannot produce a wrong minimum by starting with this. + * Initialize the "min" calculation with + * GetOldestNonRemovableTransactionId(), which is a reasonable + * approximation to the minimum relfrozenxid for not-yet-committed + * pg_class entries for new tables; see AddNewRelationTuple(). So we + * cannot produce a wrong minimum by starting with this. */ - newFrozenXid = GetOldestXmin(NULL, PROCARRAY_FLAGS_VACUUM); + newFrozenXid = GetOldestNonRemovableTransactionId(NULL); /* * Similarly, initialize the MultiXact "min" with the value that would be @@ -1681,8 +1699,9 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params) StartTransactionCommand(); /* - * Functions in indexes may want a snapshot set. Also, setting a snapshot - * ensures that RecentGlobalXmin is kept truly recent. 
+ * Need to acquire a snapshot to prevent pg_subtrans from being truncated, + * cutoff xids in local memory wrapping around, and to have updated xmin + * horizons. */ PushActiveSnapshot(GetTransactionSnapshot()); @@ -1705,8 +1724,8 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params) * * Note: these flags remain set until CommitTransaction or * AbortTransaction. We don't want to clear them until we reset - * MyPgXact->xid/xmin, else OldestXmin might appear to go backwards, - * which is probably Not Good. + * MyPgXact->xid/xmin, otherwise GetOldestNonRemovableTransactionId() + * might appear to go backwards, which is probably Not Good. */ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); MyPgXact->vacuumFlags |= PROC_IN_VACUUM; diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 9c7d4b0c60e4..ac97e28be19c 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -1877,6 +1877,10 @@ get_database_list(void) * the secondary effect that it sets RecentGlobalXmin. (This is critical * for anything that reads heap pages, because HOT may decide to prune * them even if the process doesn't attempt to modify any tuples.) + * + * FIXME: This comment is inaccurate / the code buggy. A snapshot that is + * not pushed/active does not reliably prevent HOT pruning (->xmin could + * e.g. be cleared when cache invalidations are processed). */ StartTransactionCommand(); (void) GetTransactionSnapshot(); diff --git a/src/backend/replication/logical/launcher.c b/src/backend/replication/logical/launcher.c index ff985b9b24ca..bdaf0312d63d 100644 --- a/src/backend/replication/logical/launcher.c +++ b/src/backend/replication/logical/launcher.c @@ -122,6 +122,10 @@ get_subscription_list(void) * the secondary effect that it sets RecentGlobalXmin. (This is critical * for anything that reads heap pages, because HOT may decide to prune * them even if the process doesn't attempt to modify any tuples.) 
+ * + * FIXME: This comment is inaccurate / the code buggy. A snapshot that is + * not pushed/active does not reliably prevent HOT pruning (->xmin could + * e.g. be cleared when cache invalidations are processed). */ StartTransactionCommand(); (void) GetTransactionSnapshot(); diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index d5a9b568a682..7c11e1ab44cb 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -1181,22 +1181,7 @@ XLogWalRcvSendHSFeedback(bool immed) */ if (hot_standby_feedback) { - TransactionId slot_xmin; - - /* - * Usually GetOldestXmin() would include both global replication slot - * xmin and catalog_xmin in its calculations, but we want to derive - * separate values for each of those. So we ask for an xmin that - * excludes the catalog_xmin. - */ - xmin = GetOldestXmin(NULL, - PROCARRAY_FLAGS_DEFAULT | PROCARRAY_SLOTS_XMIN); - - ProcArrayGetReplicationSlotXmin(&slot_xmin, &catalog_xmin); - - if (TransactionIdIsValid(slot_xmin) && - TransactionIdPrecedes(slot_xmin, xmin)) - xmin = slot_xmin; + GetReplicationHorizons(&xmin, &catalog_xmin); } else { diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index d13220c14008..460ca3f947f4 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -2113,9 +2113,10 @@ ProcessStandbyHSFeedbackMessage(void) /* * Set the WalSender's xmin equal to the standby's requested xmin, so that - * the xmin will be taken into account by GetOldestXmin. This will hold - * back the removal of dead rows and thereby prevent the generation of - * cleanup conflicts on the standby server. + * the xmin will be taken into account by GetSnapshotData() / + * ComputeXidHorizons(). This will hold back the removal of dead rows and + * thereby prevent the generation of cleanup conflicts on the standby + * server. 
* * There is a small window for a race condition here: although we just * checked that feedbackXmin precedes nextXid, the nextXid could have @@ -2128,10 +2129,10 @@ ProcessStandbyHSFeedbackMessage(void) * own xmin would prevent nextXid from advancing so far. * * We don't bother taking the ProcArrayLock here. Setting the xmin field - * is assumed atomic, and there's no real need to prevent a concurrent - * GetOldestXmin. (If we're moving our xmin forward, this is obviously - * safe, and if we're moving it backwards, well, the data is at risk - * already since a VACUUM could have just finished calling GetOldestXmin.) + * is assumed atomic, and there's no real need to prevent concurrent + * horizon determinations. (If we're moving our xmin forward, this is + * obviously safe, and if we're moving it backwards, well, the data is at + * risk already since a VACUUM could already have determined the horizon.) * * If we're using a replication slot we reserve the xmin via that, * otherwise via the walsender's PGXACT entry. We can only track the diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 522518695eec..e582d5af4291 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -99,6 +99,142 @@ typedef struct ProcArrayStruct int pgprocnos[FLEXIBLE_ARRAY_MEMBER]; } ProcArrayStruct; +/* + * State for the GlobalVisTest* family of functions. Those functions can + * e.g. be used to decide if a deleted row can be removed without violating + * MVCC semantics: If the deleted row's xmax is not considered to be running + * by anyone, the row can be removed. + * + * To avoid slowing down GetSnapshotData(), we don't calculate a precise + * cutoff XID while building a snapshot (looking at the frequently changing + * xmins scales badly). 
Instead we compute two boundaries while building the
+ * snapshot:
+ *
+ * 1) definitely_needed, indicating that rows deleted by XIDs >=
+ *    definitely_needed are definitely still visible.
+ *
+ * 2) maybe_needed, indicating that rows deleted by XIDs < maybe_needed can
+ *    definitely be removed
+ *
+ * When testing an XID that falls in between the two (i.e. XID >= maybe_needed
+ * && XID < definitely_needed), the boundaries can be recomputed (using
+ * ComputeXidHorizons()) to get a more accurate answer. This is cheaper than
+ * maintaining an accurate value all the time.
+ *
+ * As it is not cheap to compute accurate boundaries, we limit the number of
+ * times that happens in short succession. See GlobalVisTestShouldUpdate().
+ *
+ *
+ * There are three backend lifetime instances of this struct, optimized for
+ * different types of relations. As e.g. a normal user defined table in one
+ * database is inaccessible to backends connected to another database, a test
+ * specific to a relation can be more aggressive than a test for a shared
+ * relation. Currently we track three different states:
+ *
+ * 1) GlobalVisSharedRels, which only considers an XID's
+ *    effects visible-to-everyone if neither snapshots in any database, nor a
+ *    replication slot's xmin, nor a replication slot's catalog_xmin might
+ *    still consider XID as running.
+ *
+ * 2) GlobalVisCatalogRels, which only considers an XID's
+ *    effects visible-to-everyone if neither snapshots in the current
+ *    database, nor a replication slot's xmin, nor a replication slot's
+ *    catalog_xmin might still consider XID as running.
+ *
+ *    I.e. the difference to GlobalVisSharedRels is that
+ *    snapshots in other databases are ignored.
+ *
+ * 3) GlobalVisDataRels, which only considers an XID's
+ *    effects visible-to-everyone if neither snapshots in the current
+ *    database, nor a replication slot's xmin consider XID as running.
+ *
+ *    I.e. 
the difference to GlobalVisCatalogRels is that + * replication slot's catalog_xmin is not taken into account. + * + * GlobalVisTestFor(relation) returns the appropriate state + * for the relation. + * + * The boundaries are FullTransactionIds instead of TransactionIds to avoid + * wraparound dangers. There e.g. would otherwise exist no procarray state to + * prevent maybe_needed from becoming old enough after the GetSnapshotData() + * call. + * + * The typedef is in the header. + */ +struct GlobalVisState +{ + /* XIDs >= are considered running by some backend */ + FullTransactionId definitely_needed; + + /* XIDs < are not considered to be running by any backend */ + FullTransactionId maybe_needed; +}; + +/* + * Result of ComputeXidHorizons(). + */ +typedef struct ComputeXidHorizonsResult +{ + /* + * The value of ShmemVariableCache->latestCompletedXid when + * ComputeXidHorizons() held ProcArrayLock. + */ + FullTransactionId latest_completed; + + /* + * The same for procArray->replication_slot_xmin and + * procArray->replication_slot_catalog_xmin. + */ + TransactionId slot_xmin; + TransactionId slot_catalog_xmin; + + /* + * Oldest xid that any backend might still consider running. This needs to + * include processes running VACUUM, in contrast to the normal visibility + * cutoffs, as vacuum needs to be able to perform pg_subtrans lookups when + * determining visibility, but doesn't care about rows above its xmin to + * be removed. + * + * This likely should only be needed to determine whether pg_subtrans can + * be truncated. It currently includes the effects of replication slots, + * for historical reasons. But that could likely be changed. + */ + TransactionId oldest_considered_running; + + /* + * Oldest xid for which deleted tuples need to be retained in shared + * tables. + * + * This includes the effects of replication slots. 
If that's not desired, + * look at shared_oldest_nonremovable_raw. + */ + TransactionId shared_oldest_nonremovable; + + /* + * Oldest xid that may be necessary to retain in shared tables. This is + * the same as shared_oldest_nonremovable, except that it is not affected by + * replication slot's catalog_xmin. + * + * This is mainly useful to be able to send the catalog_xmin to upstream + * streaming replication servers via hot_standby_feedback, so they can + * apply the limit only when accessing catalog tables. + */ + TransactionId shared_oldest_nonremovable_raw; + + /* + * Oldest xid for which deleted tuples need to be retained in non-shared + * catalog tables. + */ + TransactionId catalog_oldest_nonremovable; + + /* + * Oldest xid for which deleted tuples need to be retained in normal user + * defined tables. + */ + TransactionId data_oldest_nonremovable; +} ComputeXidHorizonsResult; + + static ProcArrayStruct *procArray; static PGPROC *allProcs; @@ -118,6 +254,22 @@ static TransactionId latestObservedXid = InvalidTransactionId; */ static TransactionId standbySnapshotPendingXmin; +/* + * State for visibility checks on different types of relations. See struct + * GlobalVisState for details. As shared, catalog, and user defined + * relations can have different horizons, one such state exists for each. + */ +static GlobalVisState GlobalVisSharedRels; +static GlobalVisState GlobalVisCatalogRels; +static GlobalVisState GlobalVisDataRels; + +/* + * This backend's RecentXmin at the last time the accurate xmin horizon was + * recomputed, or InvalidTransactionId if it has not. Used to limit how many + * times accurate horizons are recomputed. See GlobalVisTestShouldUpdate(). 
+ */ +static TransactionId ComputeXidHorizonsResultLastXmin; + #ifdef XIDCACHE_DEBUG /* counters for XidCache measurement */ @@ -180,6 +332,7 @@ static void MaintainLatestCompletedXidRecovery(TransactionId latestXid); static inline FullTransactionId FullXidRelativeTo(FullTransactionId rel, TransactionId xid); +static void GlobalVisUpdateApply(ComputeXidHorizonsResult *horizons); /* * Report shared-memory space needed by CreateSharedProcArray. @@ -1302,159 +1455,191 @@ TransactionIdIsActive(TransactionId xid) /* - * GetOldestXmin -- returns oldest transaction that was running - * when any current transaction was started. + * Determine XID horizons. * - * If rel is NULL or a shared relation, all backends are considered, otherwise - * only backends running in this database are considered. + * This is used by wrapper functions like GetOldestNonRemovableTransactionId() + * (for VACUUM), GetReplicationHorizons() (for hot_standby_feedback), etc. as + * well as "internally" by GlobalVisUpdate() (see comment above struct + * GlobalVisState). * - * The flags are used to ignore the backends in calculation when any of the - * corresponding flags is set. Typically, if you want to ignore ones with - * PROC_IN_VACUUM flag, you can use PROCARRAY_FLAGS_VACUUM. + * See the definition of ComputeXidHorizonsResult for the various computed + * horizons. * - * PROCARRAY_SLOTS_XMIN causes GetOldestXmin to ignore the xmin and - * catalog_xmin of any replication slots that exist in the system when - * calculating the oldest xmin. + * For VACUUM separate horizons (used to decide which deleted tuples must + * be preserved), for shared and non-shared tables are computed. For shared + * relations backends in all databases must be considered, but for non-shared + * relations that's not required, since only backends in my own database could + * ever see the tuples in them. 
Also, we can ignore concurrently running lazy + * VACUUMs because (a) they must be working on other tables, and (b) they + * don't need to do snapshot-based lookups. * - * This is used by VACUUM to decide which deleted tuples must be preserved in - * the passed in table. For shared relations backends in all databases must be - * considered, but for non-shared relations that's not required, since only - * backends in my own database could ever see the tuples in them. Also, we can - * ignore concurrently running lazy VACUUMs because (a) they must be working - * on other tables, and (b) they don't need to do snapshot-based lookups. - * - * This is also used to determine where to truncate pg_subtrans. For that - * backends in all databases have to be considered, so rel = NULL has to be - * passed in. + * This also computes a horizon used to truncate pg_subtrans. For that + * backends in all databases have to be considered, and concurrently running + * lazy VACUUMs cannot be ignored, as they still may perform pg_subtrans + * accesses. * * Note: we include all currently running xids in the set of considered xids. * This ensures that if a just-started xact has not yet set its snapshot, * when it does set the snapshot it cannot set xmin less than what we compute. * See notes in src/backend/access/transam/README. * - * Note: despite the above, it's possible for the calculated value to move - * backwards on repeated calls. The calculated value is conservative, so that - * anything older is definitely not considered as running by anyone anymore, - * but the exact value calculated depends on a number of things. For example, - * if rel = NULL and there are no transactions running in the current - * database, GetOldestXmin() returns latestCompletedXid. If a transaction + * Note: despite the above, it's possible for the calculated values to move + * backwards on repeated calls. 
The calculated values are conservative, so + * that anything older is definitely not considered as running by anyone + * anymore, but the exact values calculated depend on a number of things. For + * example, if there are no transactions running in the current database, the + * horizon for normal tables will be latestCompletedXid. If a transaction * begins after that, its xmin will include in-progress transactions in other * databases that started earlier, so another call will return a lower value. * Nonetheless it is safe to vacuum a table in the current database with the * first result. There are also replication-related effects: a walsender * process can set its xmin based on transactions that are no longer running * on the primary but are still being replayed on the standby, thus possibly - * making the GetOldestXmin reading go backwards. In this case there is a - * possibility that we lose data that the standby would like to have, but - * unless the standby uses a replication slot to make its xmin persistent - * there is little we can do about that --- data is only protected if the - * walsender runs continuously while queries are executed on the standby. - * (The Hot Standby code deals with such cases by failing standby queries - * that needed to access already-removed data, so there's no integrity bug.) - * The return value is also adjusted with vacuum_defer_cleanup_age, so - * increasing that setting on the fly is another easy way to make - * GetOldestXmin() move backwards, with no consequences for data integrity. + * making the values go backwards. In this case there is a possibility that + * we lose data that the standby would like to have, but unless the standby + * uses a replication slot to make its xmin persistent there is little we can + * do about that --- data is only protected if the walsender runs continuously + * while queries are executed on the standby. 
(The Hot Standby code deals + * with such cases by failing standby queries that needed to access + * already-removed data, so there's no integrity bug.) The computed values + * are also adjusted with vacuum_defer_cleanup_age, so increasing that setting + * on the fly is another easy way to make horizons move backwards, with no + * consequences for data integrity. + * + * Note: the approximate horizons (see definition of GlobalVisState) are + * updated by the computations done here. That's currently required for + * correctness and a small optimization. Without doing so it's possible that + * heap vacuum's call to heap_page_prune() uses a more conservative horizon + * than later when deciding which tuples can be removed - which the code + * doesn't expect (breaking HOT). */ -TransactionId -GetOldestXmin(Relation rel, int flags) +static void +ComputeXidHorizons(ComputeXidHorizonsResult *h) { ProcArrayStruct *arrayP = procArray; - TransactionId result; - int index; - bool allDbs; - - TransactionId replication_slot_xmin = InvalidTransactionId; - TransactionId replication_slot_catalog_xmin = InvalidTransactionId; - - /* - * If we're not computing a relation specific limit, or if a shared - * relation has been passed in, backends in all databases have to be - * considered. - */ - allDbs = rel == NULL || rel->rd_rel->relisshared; + TransactionId kaxmin; + bool in_recovery = RecoveryInProgress(); - /* Cannot look for individual databases during recovery */ - Assert(allDbs || !RecoveryInProgress()); + /* inferred after ProcArrayLock is released */ + h->catalog_oldest_nonremovable = InvalidTransactionId; LWLockAcquire(ProcArrayLock, LW_SHARED); + h->latest_completed = ShmemVariableCache->latestCompletedXid; + /* * We initialize the MIN() calculation with latestCompletedXid + 1. This * is a lower bound for the XIDs that might appear in the ProcArray later, * and so protects us against overestimating the result due to future * additions. 
*/ - result = XidFromFullTransactionId(ShmemVariableCache->latestCompletedXid); - TransactionIdAdvance(result); - Assert(TransactionIdIsNormal(result)); + { + TransactionId initial; - for (index = 0; index < arrayP->numProcs; index++) + initial = XidFromFullTransactionId(h->latest_completed); + Assert(TransactionIdIsValid(initial)); + TransactionIdAdvance(initial); + + h->oldest_considered_running = initial; + h->shared_oldest_nonremovable = initial; + h->data_oldest_nonremovable = initial; + } + + /* + * Fetch slot horizons while ProcArrayLock is held - the + * LWLockAcquire/LWLockRelease are a barrier, ensuring this happens inside + * the lock. + */ + h->slot_xmin = procArray->replication_slot_xmin; + h->slot_catalog_xmin = procArray->replication_slot_catalog_xmin; + + for (int index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; PGPROC *proc = &allProcs[pgprocno]; PGXACT *pgxact = &allPgXact[pgprocno]; + TransactionId xid; + TransactionId xmin; + + /* Fetch xid just once - see GetNewTransactionId */ + xid = UINT32_ACCESS_ONCE(pgxact->xid); + xmin = UINT32_ACCESS_ONCE(pgxact->xmin); + + /* + * Consider both the transaction's Xmin, and its Xid. + * + * We must check both because a transaction might have an Xmin but not + * (yet) an Xid; conversely, if it has an Xid, that could determine + * some not-yet-set Xmin. + */ + xmin = TransactionIdOlder(xmin, xid); - if (pgxact->vacuumFlags & (flags & PROCARRAY_PROC_FLAGS_MASK)) + /* if neither is set, this proc doesn't influence the horizon */ + if (!TransactionIdIsValid(xmin)) continue; - if (allDbs || + /* + * Don't ignore any procs when determining which transactions might be + * considered running. While slots should ensure logical decoding + * backends are protected even without this check, it can't hurt to + * include them here as well.. 
+ */ + h->oldest_considered_running = + TransactionIdOlder(h->oldest_considered_running, xmin); + + /* + * Skip over backends either vacuuming (which is ok with rows being + * removed, as long as pg_subtrans is not truncated) or doing logical + * decoding (which manages xmin separately, check below). + */ + if (pgxact->vacuumFlags & (PROC_IN_VACUUM | PROC_IN_LOGICAL_DECODING)) + continue; + + /* shared tables need to take backends in all database into account */ + h->shared_oldest_nonremovable = + TransactionIdOlder(h->shared_oldest_nonremovable, xmin); + + /* + * Normally queries in other databases are ignored for anything but + * the shared horizon. But in recovery we cannot compute an accurate + * per-database horizon as all xids are managed via the + * KnownAssignedXids machinery. + */ + if (in_recovery || proc->databaseId == MyDatabaseId || proc->databaseId == 0) /* always include WalSender */ { - /* Fetch xid just once - see GetNewTransactionId */ - TransactionId xid = UINT32_ACCESS_ONCE(pgxact->xid); - - /* First consider the transaction's own Xid, if any */ - if (TransactionIdIsNormal(xid) && - TransactionIdPrecedes(xid, result)) - result = xid; - - /* - * Also consider the transaction's Xmin, if set. - * - * We must check both Xid and Xmin because a transaction might - * have an Xmin but not (yet) an Xid; conversely, if it has an - * Xid, that could determine some not-yet-set Xmin. - */ - xid = UINT32_ACCESS_ONCE(pgxact->xmin); - if (TransactionIdIsNormal(xid) && - TransactionIdPrecedes(xid, result)) - result = xid; + h->data_oldest_nonremovable = + TransactionIdOlder(h->data_oldest_nonremovable, xmin); } } /* - * Fetch into local variable while ProcArrayLock is held - the - * LWLockRelease below is a barrier, ensuring this happens inside the - * lock. + * If in recovery fetch oldest xid in KnownAssignedXids, will be applied + * after lock is released. 
*/ - replication_slot_xmin = procArray->replication_slot_xmin; - replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin; + if (in_recovery) + kaxmin = KnownAssignedXidsGetOldestXmin(); - if (RecoveryInProgress()) - { - /* - * Check to see whether KnownAssignedXids contains an xid value older - * than the main procarray. - */ - TransactionId kaxmin = KnownAssignedXidsGetOldestXmin(); - - LWLockRelease(ProcArrayLock); + /* + * No other information from shared state is needed, release the lock + * immediately. The rest of the computations can be done without a lock. + */ + LWLockRelease(ProcArrayLock); - if (TransactionIdIsNormal(kaxmin) && - TransactionIdPrecedes(kaxmin, result)) - result = kaxmin; + if (in_recovery) + { + h->oldest_considered_running = + TransactionIdOlder(h->oldest_considered_running, kaxmin); + h->shared_oldest_nonremovable = + TransactionIdOlder(h->shared_oldest_nonremovable, kaxmin); + h->data_oldest_nonremovable = + TransactionIdOlder(h->data_oldest_nonremovable, kaxmin); } else { /* - * No other information needed, so release the lock immediately. - */ - LWLockRelease(ProcArrayLock); - - /* - * Compute the cutoff XID by subtracting vacuum_defer_cleanup_age, - * being careful not to generate a "permanent" XID. + * Compute the cutoff XID by subtracting vacuum_defer_cleanup_age. * * vacuum_defer_cleanup_age provides some additional "slop" for the * benefit of hot standby queries on standby servers. This is quick @@ -1466,34 +1651,146 @@ GetOldestXmin(Relation rel, int flags) * in varsup.c. Also note that we intentionally don't apply * vacuum_defer_cleanup_age on standby servers. 
*/ - result -= vacuum_defer_cleanup_age; - if (!TransactionIdIsNormal(result)) - result = FirstNormalTransactionId; + h->oldest_considered_running = + TransactionIdRetreatedBy(h->oldest_considered_running, + vacuum_defer_cleanup_age); + h->shared_oldest_nonremovable = + TransactionIdRetreatedBy(h->shared_oldest_nonremovable, + vacuum_defer_cleanup_age); + h->data_oldest_nonremovable = + TransactionIdRetreatedBy(h->data_oldest_nonremovable, + vacuum_defer_cleanup_age); } /* * Check whether there are replication slots requiring an older xmin. */ - if (!(flags & PROCARRAY_SLOTS_XMIN) && - TransactionIdIsValid(replication_slot_xmin) && - NormalTransactionIdPrecedes(replication_slot_xmin, result)) - result = replication_slot_xmin; + h->shared_oldest_nonremovable = + TransactionIdOlder(h->shared_oldest_nonremovable, h->slot_xmin); + h->data_oldest_nonremovable = + TransactionIdOlder(h->data_oldest_nonremovable, h->slot_xmin); /* - * After locks have been released and vacuum_defer_cleanup_age has been - * applied, check whether we need to back up further to make logical - * decoding possible. We need to do so if we're computing the global limit - * (rel = NULL) or if the passed relation is a catalog relation of some - * kind. + * The only difference between catalog / data horizons is that the slot's + * catalog xmin is applied to the catalog one (so catalogs can be accessed + * for logical decoding). Initialize with data horizon, and then back up + * further if necessary. Have to back up the shared horizon as well, since + * that also can contain catalogs. 
*/ - if (!(flags & PROCARRAY_SLOTS_XMIN) && - (rel == NULL || - RelationIsAccessibleInLogicalDecoding(rel)) && - TransactionIdIsValid(replication_slot_catalog_xmin) && - NormalTransactionIdPrecedes(replication_slot_catalog_xmin, result)) - result = replication_slot_catalog_xmin; + h->shared_oldest_nonremovable_raw = h->shared_oldest_nonremovable; + h->shared_oldest_nonremovable = + TransactionIdOlder(h->shared_oldest_nonremovable, + h->slot_catalog_xmin); + h->catalog_oldest_nonremovable = h->data_oldest_nonremovable; + h->catalog_oldest_nonremovable = + TransactionIdOlder(h->catalog_oldest_nonremovable, + h->slot_catalog_xmin); - return result; + /* + * It's possible that slots / vacuum_defer_cleanup_age backed up the + * horizons further than oldest_considered_running. Fix. + */ + h->oldest_considered_running = + TransactionIdOlder(h->oldest_considered_running, + h->shared_oldest_nonremovable); + h->oldest_considered_running = + TransactionIdOlder(h->oldest_considered_running, + h->catalog_oldest_nonremovable); + h->oldest_considered_running = + TransactionIdOlder(h->oldest_considered_running, + h->data_oldest_nonremovable); + + /* + * shared horizons have to be at least as old as the oldest visible in + * current db + */ + Assert(TransactionIdPrecedesOrEquals(h->shared_oldest_nonremovable, + h->data_oldest_nonremovable)); + Assert(TransactionIdPrecedesOrEquals(h->shared_oldest_nonremovable, + h->catalog_oldest_nonremovable)); + + /* + * Horizons need to ensure that pg_subtrans access is still possible for + * the relevant backends. 
+ */ + Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->shared_oldest_nonremovable)); + Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->catalog_oldest_nonremovable)); + Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->data_oldest_nonremovable)); + Assert(!TransactionIdIsValid(h->slot_xmin) || + TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->slot_xmin)); + Assert(!TransactionIdIsValid(h->slot_catalog_xmin) || + TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->slot_catalog_xmin)); + + /* update approximate horizons with the computed horizons */ + GlobalVisUpdateApply(h); +} + +/* + * Return the oldest XID for which deleted tuples must be preserved in the + * passed table. + * + * If rel is not NULL the horizon may be considerably more recent than + * otherwise (i.e. fewer tuples will be removable). In the NULL case a horizon + * that is correct (but not optimal) for all relations will be returned. + * + * This is used by VACUUM to decide which deleted tuples must be preserved in + * the passed in table. + */ +TransactionId +GetOldestNonRemovableTransactionId(Relation rel) +{ + ComputeXidHorizonsResult horizons; + + ComputeXidHorizons(&horizons); + + /* select horizon appropriate for relation */ + if (rel == NULL || rel->rd_rel->relisshared) + return horizons.shared_oldest_nonremovable; + else if (RelationIsAccessibleInLogicalDecoding(rel)) + return horizons.catalog_oldest_nonremovable; + else + return horizons.data_oldest_nonremovable; +} + +/* + * Return the oldest transaction id any currently running backend might still + * consider running. This should not be used for visibility / pruning + * determinations (see GetOldestNonRemovableTransactionId()), but for + * decisions like up to where pg_subtrans can be truncated. 
+ */ +TransactionId +GetOldestTransactionIdConsideredRunning(void) +{ + ComputeXidHorizonsResult horizons; + + ComputeXidHorizons(&horizons); + + return horizons.oldest_considered_running; +} + +/* + * Return the visibility horizons for a hot standby feedback message. + */ +void +GetReplicationHorizons(TransactionId *xmin, TransactionId *catalog_xmin) +{ + ComputeXidHorizonsResult horizons; + + ComputeXidHorizons(&horizons); + + /* + * Don't want to use shared_oldest_nonremovable here, as that contains the + * effect of replication slot's catalog_xmin. We want to send a separate + * feedback for the catalog horizon, so the primary can remove data table + * contents more aggressively. + */ + *xmin = horizons.shared_oldest_nonremovable_raw; + *catalog_xmin = horizons.slot_catalog_xmin; } /* @@ -1544,12 +1841,9 @@ GetMaxSnapshotSubxidCount(void) * current transaction (this is the same as MyPgXact->xmin). * RecentXmin: the xmin computed for the most recent snapshot. XIDs * older than this are known not running any more. - * RecentGlobalXmin: the global xmin (oldest TransactionXmin across all - * running transactions, except those running LAZY VACUUM). This is - * the same computation done by - * GetOldestXmin(NULL, PROCARRAY_FLAGS_VACUUM). - * RecentGlobalDataXmin: the global xmin for non-catalog tables - * >= RecentGlobalXmin + * + * And try to advance the bounds of GlobalVisSharedRels, GlobalVisCatalogRels, + * GlobalVisDataRels for the benefit of the GlobalVisTest* family of functions. * * Note: this function should probably not be called with an argument that's * not statically allocated (see xip allocation below). 
@@ -1560,12 +1854,12 @@ GetSnapshotData(Snapshot snapshot) ProcArrayStruct *arrayP = procArray; TransactionId xmin; TransactionId xmax; - TransactionId globalxmin; int index; int count = 0; int subcount = 0; bool suboverflowed = false; FullTransactionId latest_completed; + TransactionId oldestxid; TransactionId replication_slot_xmin = InvalidTransactionId; TransactionId replication_slot_catalog_xmin = InvalidTransactionId; @@ -1610,13 +1904,15 @@ GetSnapshotData(Snapshot snapshot) LWLockAcquire(ProcArrayLock, LW_SHARED); latest_completed = ShmemVariableCache->latestCompletedXid; + oldestxid = ShmemVariableCache->oldestXid; + /* xmax is always latestCompletedXid + 1 */ xmax = XidFromFullTransactionId(latest_completed); TransactionIdAdvance(xmax); Assert(TransactionIdIsNormal(xmax)); /* initialize xmin calculation with xmax */ - globalxmin = xmin = xmax; + xmin = xmax; snapshot->takenDuringRecovery = RecoveryInProgress(); @@ -1645,12 +1941,6 @@ GetSnapshotData(Snapshot snapshot) (PROC_IN_LOGICAL_DECODING | PROC_IN_VACUUM)) continue; - /* Update globalxmin to be the smallest valid xmin */ - xid = UINT32_ACCESS_ONCE(pgxact->xmin); - if (TransactionIdIsNormal(xid) && - NormalTransactionIdPrecedes(xid, globalxmin)) - globalxmin = xid; - /* Fetch xid just once - see GetNewTransactionId */ xid = UINT32_ACCESS_ONCE(pgxact->xid); @@ -1766,34 +2056,78 @@ GetSnapshotData(Snapshot snapshot) LWLockRelease(ProcArrayLock); - /* - * Update globalxmin to include actual process xids. This is a slightly - * different way of computing it than GetOldestXmin uses, but should give - * the same result. 
- */ - if (TransactionIdPrecedes(xmin, globalxmin)) - globalxmin = xmin; + /* maintain state for GlobalVis* */ + { + TransactionId def_vis_xid; + TransactionId def_vis_xid_data; + FullTransactionId def_vis_fxid; + FullTransactionId def_vis_fxid_data; + FullTransactionId oldestfxid; - /* Update global variables too */ - RecentGlobalXmin = globalxmin - vacuum_defer_cleanup_age; - if (!TransactionIdIsNormal(RecentGlobalXmin)) - RecentGlobalXmin = FirstNormalTransactionId; + /* + * Converting oldestXid is only safe when xid horizon cannot advance, + * i.e. holding locks. While we don't hold the lock anymore, all the + * necessary data has been gathered with lock held. + */ + oldestfxid = FullXidRelativeTo(latest_completed, oldestxid); - /* Check whether there's a replication slot requiring an older xmin. */ - if (TransactionIdIsValid(replication_slot_xmin) && - NormalTransactionIdPrecedes(replication_slot_xmin, RecentGlobalXmin)) - RecentGlobalXmin = replication_slot_xmin; + /* apply vacuum_defer_cleanup_age */ + def_vis_xid_data = + TransactionIdRetreatedBy(xmin, vacuum_defer_cleanup_age); - /* Non-catalog tables can be vacuumed if older than this xid */ - RecentGlobalDataXmin = RecentGlobalXmin; + /* Check whether there's a replication slot requiring an older xmin. */ + def_vis_xid_data = + TransactionIdOlder(def_vis_xid_data, replication_slot_xmin); - /* - * Check whether there's a replication slot requiring an older catalog - * xmin. - */ - if (TransactionIdIsNormal(replication_slot_catalog_xmin) && - NormalTransactionIdPrecedes(replication_slot_catalog_xmin, RecentGlobalXmin)) - RecentGlobalXmin = replication_slot_catalog_xmin; + /* + * Rows in non-shared, non-catalog tables possibly could be vacuumed + * if older than this xid. + */ + def_vis_xid = def_vis_xid_data; + + /* + * Check whether there's a replication slot requiring an older catalog + * xmin. 
+ */ + def_vis_xid = + TransactionIdOlder(replication_slot_catalog_xmin, def_vis_xid); + + def_vis_fxid = FullXidRelativeTo(latest_completed, def_vis_xid); + def_vis_fxid_data = FullXidRelativeTo(latest_completed, def_vis_xid_data); + + /* + * Check if we can increase upper bound. As a previous + * GlobalVisUpdate() might have computed more aggressive values, don't + * overwrite them if so. + */ + GlobalVisSharedRels.definitely_needed = + FullTransactionIdNewer(def_vis_fxid, + GlobalVisSharedRels.definitely_needed); + GlobalVisCatalogRels.definitely_needed = + FullTransactionIdNewer(def_vis_fxid, + GlobalVisCatalogRels.definitely_needed); + GlobalVisDataRels.definitely_needed = + FullTransactionIdNewer(def_vis_fxid_data, + GlobalVisDataRels.definitely_needed); + + /* + * Check if we know that we can initialize or increase the lower + * bound. Currently the only cheap way to do so is to use + * ShmemVariableCache->oldestXid as input. + * + * We should definitely be able to do better. We could e.g. put a + * global lower bound value into ShmemVariableCache. + */ + GlobalVisSharedRels.maybe_needed = + FullTransactionIdNewer(GlobalVisSharedRels.maybe_needed, + oldestfxid); + GlobalVisCatalogRels.maybe_needed = + FullTransactionIdNewer(GlobalVisCatalogRels.maybe_needed, + oldestfxid); + GlobalVisDataRels.maybe_needed = + FullTransactionIdNewer(GlobalVisDataRels.maybe_needed, + oldestfxid); + } RecentXmin = xmin; @@ -3291,6 +3625,255 @@ DisplayXidCache(void) } #endif /* XIDCACHE_DEBUG */ +/* + * If rel != NULL, return test state appropriate for relation, otherwise + * return state usable for all relations. The latter may consider XIDs as + * not-yet-visible-to-everyone that a state for a specific relation would + * already consider visible-to-everyone. + * + * This needs to be called while a snapshot is active or registered, otherwise + * there are wraparound and other dangers. + * + * See comment for GlobalVisState for details. 
+ */ +GlobalVisState * +GlobalVisTestFor(Relation rel) +{ + bool need_shared; + bool need_catalog; + GlobalVisState *state; + + /* XXX: we should assert that a snapshot is pushed or registered */ + Assert(RecentXmin); + + if (!rel) + need_shared = need_catalog = true; + else + { + /* + * Other kinds currently don't contain xids, nor always the necessary + * logical decoding markers. + */ + Assert(rel->rd_rel->relkind == RELKIND_RELATION || + rel->rd_rel->relkind == RELKIND_MATVIEW || + rel->rd_rel->relkind == RELKIND_TOASTVALUE); + + need_shared = rel->rd_rel->relisshared || RecoveryInProgress(); + need_catalog = IsCatalogRelation(rel) || RelationIsAccessibleInLogicalDecoding(rel); + } + + if (need_shared) + state = &GlobalVisSharedRels; + else if (need_catalog) + state = &GlobalVisCatalogRels; + else + state = &GlobalVisDataRels; + + Assert(FullTransactionIdIsValid(state->definitely_needed) && + FullTransactionIdIsValid(state->maybe_needed)); + + return state; +} + +/* + * Return true if it's worth updating the accurate maybe_needed boundary. + * + * As it is somewhat expensive to determine xmin horizons, we don't want to + * repeatedly do so when there is a low likelihood of it being beneficial. + * + * The current heuristic is that we update only if RecentXmin has changed + * since the last update. If the oldest currently running transaction has not + * finished, it is unlikely that recomputing the horizon would be useful. + */ +static bool +GlobalVisTestShouldUpdate(GlobalVisState *state) +{ + /* hasn't been updated yet */ + if (!TransactionIdIsValid(ComputeXidHorizonsResultLastXmin)) + return true; + + /* + * If the maybe_needed/definitely_needed boundaries are the same, it's + * unlikely to be beneficial to refresh boundaries. + */ + if (FullTransactionIdFollowsOrEquals(state->maybe_needed, + state->definitely_needed)) + return false; + + /* does the last snapshot built have a different xmin? 
*/ + return RecentXmin != ComputeXidHorizonsResultLastXmin; +} + +static void +GlobalVisUpdateApply(ComputeXidHorizonsResult *horizons) +{ + GlobalVisSharedRels.maybe_needed = + FullXidRelativeTo(horizons->latest_completed, + horizons->shared_oldest_nonremovable); + GlobalVisCatalogRels.maybe_needed = + FullXidRelativeTo(horizons->latest_completed, + horizons->catalog_oldest_nonremovable); + GlobalVisDataRels.maybe_needed = + FullXidRelativeTo(horizons->latest_completed, + horizons->data_oldest_nonremovable); + + /* + * In longer running transactions it's possible that transactions we + * previously needed to treat as running aren't around anymore. So update + * definitely_needed to not be earlier than maybe_needed. + */ + GlobalVisSharedRels.definitely_needed = + FullTransactionIdNewer(GlobalVisSharedRels.maybe_needed, + GlobalVisSharedRels.definitely_needed); + GlobalVisCatalogRels.definitely_needed = + FullTransactionIdNewer(GlobalVisCatalogRels.maybe_needed, + GlobalVisCatalogRels.definitely_needed); + GlobalVisDataRels.definitely_needed = + FullTransactionIdNewer(GlobalVisDataRels.maybe_needed, + GlobalVisDataRels.definitely_needed); + + ComputeXidHorizonsResultLastXmin = RecentXmin; +} + +/* + * Update boundaries in GlobalVis{Shared,Catalog, Data}Rels + * using ComputeXidHorizons(). + */ +static void +GlobalVisUpdate(void) +{ + ComputeXidHorizonsResult horizons; + + /* updates the horizons as a side-effect */ + ComputeXidHorizons(&horizons); +} + +/* + * Return true if no snapshot still considers fxid to be running. + * + * The state passed needs to have been initialized for the relation fxid is + * from (NULL is also OK), otherwise the result may not be correct. + * + * See comment for GlobalVisState for details. + */ +bool +GlobalVisTestIsRemovableFullXid(GlobalVisState *state, + FullTransactionId fxid) +{ + /* + * If fxid is older than maybe_needed bound, it definitely is visible to + * everyone. 
+ */ + if (FullTransactionIdPrecedes(fxid, state->maybe_needed)) + return true; + + /* + * If fxid is >= definitely_needed bound, it is very likely to still be + * considered running. + */ + if (FullTransactionIdFollowsOrEquals(fxid, state->definitely_needed)) + return false; + + /* + * fxid is between maybe_needed and definitely_needed, i.e. there might or + * might not exist a snapshot considering fxid running. If it makes sense, + * update boundaries and recheck. + */ + if (GlobalVisTestShouldUpdate(state)) + { + GlobalVisUpdate(); + + Assert(FullTransactionIdPrecedes(fxid, state->definitely_needed)); + + return FullTransactionIdPrecedes(fxid, state->maybe_needed); + } + else + return false; +} + +/* + * Wrapper around GlobalVisTestIsRemovableFullXid() for 32bit xids. + * + * It is crucial that this only gets called for xids from a source that + * protects against xid wraparounds (e.g. from a table and thus protected by + * relfrozenxid). + */ +bool +GlobalVisTestIsRemovableXid(GlobalVisState *state, TransactionId xid) +{ + FullTransactionId fxid; + + /* + * Convert 32 bit argument to FullTransactionId. We can do so safely + * because we know the xid has to, at the very least, be between + * [oldestXid, nextFullXid), i.e. within 2 billion of xid. To avoid taking + * a lock to determine either, we can just compare with + * state->definitely_needed, which was based on those value at the time + * the current snapshot was built. + */ + fxid = FullXidRelativeTo(state->definitely_needed, xid); + + return GlobalVisTestIsRemovableFullXid(state, fxid); +} + +/* + * Return FullTransactionId below which all transactions are not considered + * running anymore. + * + * Note: This is less efficient than testing with + * GlobalVisTestIsRemovableFullXid as it likely requires building an accurate + * cutoff, even in the case all the XIDs compared with the cutoff are outside + * [maybe_needed, definitely_needed). 
+ */ +FullTransactionId +GlobalVisTestNonRemovableFullHorizon(GlobalVisState *state) +{ + /* acquire accurate horizon if not already done */ + if (GlobalVisTestShouldUpdate(state)) + GlobalVisUpdate(); + + return state->maybe_needed; +} + +/* Convenience wrapper around GlobalVisTestNonRemovableFullHorizon */ +TransactionId +GlobalVisTestNonRemovableHorizon(GlobalVisState *state) +{ + FullTransactionId cutoff; + + cutoff = GlobalVisTestNonRemovableFullHorizon(state); + + return XidFromFullTransactionId(cutoff); +} + +/* + * Convenience wrapper around GlobalVisTestFor() and + * GlobalVisTestIsRemovableFullXid(), see their comments. + */ +bool +GlobalVisIsRemovableFullXid(Relation rel, FullTransactionId fxid) +{ + GlobalVisState *state; + + state = GlobalVisTestFor(rel); + + return GlobalVisTestIsRemovableFullXid(state, fxid); +} + +/* + * Convenience wrapper around GlobalVisTestFor() and + * GlobalVisTestIsRemovableXid(), see their comments. + */ +bool +GlobalVisCheckRemovableXid(Relation rel, TransactionId xid) +{ + GlobalVisState *state; + + state = GlobalVisTestFor(rel); + + return GlobalVisTestIsRemovableXid(state, xid); +} + /* * Convert a 32 bit transaction id into 64 bit transaction id, by assuming it * is within MaxTransactionId / 2 of XidFromFullTransactionId(rel). diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 53d974125fd5..00c7afc66fc2 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -5786,14 +5786,15 @@ get_actual_variable_endpoint(Relation heapRel, * recent); that case motivates not using SnapshotAny here. * * A crucial point here is that SnapshotNonVacuumable, with - * RecentGlobalXmin as horizon, yields the inverse of the condition that - * the indexscan will use to decide that index entries are killable (see - * heap_hot_search_buffer()). 
Therefore, if the snapshot rejects a tuple - * (or more precisely, all tuples of a HOT chain) and we have to continue - * scanning past it, we know that the indexscan will mark that index entry - * killed. That means that the next get_actual_variable_endpoint() call - * will not have to re-consider that index entry. In this way we avoid - * repetitive work when this function is used a lot during planning. + * GlobalVisTestFor(heapRel) as horizon, yields the inverse of the + * condition that the indexscan will use to decide that index entries are + * killable (see heap_hot_search_buffer()). Therefore, if the snapshot + * rejects a tuple (or more precisely, all tuples of a HOT chain) and we + * have to continue scanning past it, we know that the indexscan will mark + * that index entry killed. That means that the next + * get_actual_variable_endpoint() call will not have to re-consider that + * index entry. In this way we avoid repetitive work when this function + * is used a lot during planning. * * But using SnapshotNonVacuumable creates a hazard of its own. In a * recently-created index, some index entries may point at "broken" HOT @@ -5805,7 +5806,8 @@ get_actual_variable_endpoint(Relation heapRel, * or could even be NULL. We avoid this hazard because we take the data * from the index entry not the heap. */ - InitNonVacuumableSnapshot(SnapshotNonVacuumable, RecentGlobalXmin); + InitNonVacuumableSnapshot(SnapshotNonVacuumable, + GlobalVisTestFor(heapRel)); index_scan = index_beginscan(heapRel, indexRel, &SnapshotNonVacuumable, diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index f4247ea70d55..893be2f3ddbf 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -722,6 +722,10 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username, * is critical for anything that reads heap pages, because HOT may decide * to prune them even if the process doesn't attempt to modify any * tuples.) 
+ * + * FIXME: This comment is inaccurate / the code buggy. A snapshot that is + * not pushed/active does not reliably prevent HOT pruning (->xmin could + * e.g. be cleared when cache invalidations are processed). */ if (!bootstrap) { diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 6b6c8571e237..604d823f6861 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -157,16 +157,9 @@ static Snapshot HistoricSnapshot = NULL; * These are updated by GetSnapshotData. We initialize them this way * for the convenience of TransactionIdIsInProgress: even in bootstrap * mode, we don't want it to say that BootstrapTransactionId is in progress. - * - * RecentGlobalXmin and RecentGlobalDataXmin are initialized to - * InvalidTransactionId, to ensure that no one tries to use a stale - * value. Readers should ensure that it has been set to something else - * before using it. */ TransactionId TransactionXmin = FirstNormalTransactionId; TransactionId RecentXmin = FirstNormalTransactionId; -TransactionId RecentGlobalXmin = InvalidTransactionId; -TransactionId RecentGlobalDataXmin = InvalidTransactionId; /* (table, ctid) => (cmin, cmax) mapping during timetravel */ static HTAB *tuplecid_data = NULL; @@ -581,9 +574,7 @@ SetTransactionSnapshot(Snapshot sourcesnap, VirtualTransactionId *sourcevxid, * Even though we are not going to use the snapshot it computes, we must * call GetSnapshotData, for two reasons: (1) to be sure that * CurrentSnapshotData's XID arrays have been allocated, and (2) to update - * RecentXmin and RecentGlobalXmin. (We could alternatively include those - * two variables in exported snapshot files, but it seems better to have - * snapshot importers compute reasonably up-to-date values for them.) + * the state for GlobalVis*. 
*/ CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData); @@ -956,36 +947,6 @@ xmin_cmp(const pairingheap_node *a, const pairingheap_node *b, void *arg) return 0; } -/* - * Get current RecentGlobalXmin value, as a FullTransactionId. - */ -FullTransactionId -GetFullRecentGlobalXmin(void) -{ - FullTransactionId nextxid_full; - uint32 nextxid_epoch; - TransactionId nextxid_xid; - uint32 epoch; - - Assert(TransactionIdIsNormal(RecentGlobalXmin)); - - /* - * Compute the epoch from the next XID's epoch. This relies on the fact - * that RecentGlobalXmin must be within the 2 billion XID horizon from the - * next XID. - */ - nextxid_full = ReadNextFullTransactionId(); - nextxid_epoch = EpochFromFullTransactionId(nextxid_full); - nextxid_xid = XidFromFullTransactionId(nextxid_full); - - if (RecentGlobalXmin > nextxid_xid) - epoch = nextxid_epoch - 1; - else - epoch = nextxid_epoch; - - return FullTransactionIdFromEpochAndXid(epoch, RecentGlobalXmin); -} - /* * SnapshotResetXmin * @@ -1753,106 +1714,157 @@ GetOldSnapshotThresholdTimestamp(void) return threshold_timestamp; } -static void +void SetOldSnapshotThresholdTimestamp(TimestampTz ts, TransactionId xlimit) { SpinLockAcquire(&oldSnapshotControl->mutex_threshold); + Assert(oldSnapshotControl->threshold_timestamp <= ts); + Assert(TransactionIdPrecedesOrEquals(oldSnapshotControl->threshold_xid, xlimit)); oldSnapshotControl->threshold_timestamp = ts; oldSnapshotControl->threshold_xid = xlimit; SpinLockRelease(&oldSnapshotControl->mutex_threshold); } +/* + * XXX: Magic to keep old_snapshot_threshold tests appear "working". They + * currently are broken, and discussion of what to do about them is + * ongoing. 
See + * https://www.postgresql.org/message-id/20200403001235.e6jfdll3gh2ygbuc%40alap3.anarazel.de + */ +void +SnapshotTooOldMagicForTest(void) +{ + TimestampTz ts = GetSnapshotCurrentTimestamp(); + + Assert(old_snapshot_threshold == 0); + + ts -= 5 * USECS_PER_SEC; + + SpinLockAcquire(&oldSnapshotControl->mutex_threshold); + oldSnapshotControl->threshold_timestamp = ts; + SpinLockRelease(&oldSnapshotControl->mutex_threshold); +} + +/* + * If there is a valid mapping for the timestamp, set *xlimitp to + * that. Returns whether there is such a mapping. + */ +static bool +GetOldSnapshotFromTimeMapping(TimestampTz ts, TransactionId *xlimitp) +{ + bool in_mapping = false; + + Assert(ts == AlignTimestampToMinuteBoundary(ts)); + + LWLockAcquire(OldSnapshotTimeMapLock, LW_SHARED); + + if (oldSnapshotControl->count_used > 0 + && ts >= oldSnapshotControl->head_timestamp) + { + int offset; + + offset = ((ts - oldSnapshotControl->head_timestamp) + / USECS_PER_MINUTE); + if (offset > oldSnapshotControl->count_used - 1) + offset = oldSnapshotControl->count_used - 1; + offset = (oldSnapshotControl->head_offset + offset) + % OLD_SNAPSHOT_TIME_MAP_ENTRIES; + + *xlimitp = oldSnapshotControl->xid_by_minute[offset]; + + in_mapping = true; + } + + LWLockRelease(OldSnapshotTimeMapLock); + + return in_mapping; +} + /* * TransactionIdLimitedForOldSnapshots * - * Apply old snapshot limit, if any. This is intended to be called for page - * pruning and table vacuuming, to allow old_snapshot_threshold to override - * the normal global xmin value. Actual testing for snapshot too old will be - * based on whether a snapshot timestamp is prior to the threshold timestamp - * set in this function. + * Apply old snapshot limit. This is intended to be called for page pruning + * and table vacuuming, to allow old_snapshot_threshold to override the normal + * global xmin value. 
Actual testing for snapshot too old will be based on + * whether a snapshot timestamp is prior to the threshold timestamp set in + * this function. + * + * If the limited horizon allows a cleanup action that otherwise would not be + * possible, SetOldSnapshotThresholdTimestamp(*limit_ts, *limit_xid) needs to + * be called before that cleanup action. */ -TransactionId +bool TransactionIdLimitedForOldSnapshots(TransactionId recentXmin, - Relation relation) + Relation relation, + TransactionId *limit_xid, + TimestampTz *limit_ts) { - if (TransactionIdIsNormal(recentXmin) - && old_snapshot_threshold >= 0 - && RelationAllowsEarlyPruning(relation)) - { - TimestampTz ts = GetSnapshotCurrentTimestamp(); - TransactionId xlimit = recentXmin; - TransactionId latest_xmin; - TimestampTz update_ts; - bool same_ts_as_threshold = false; + TimestampTz ts; + TransactionId xlimit = recentXmin; + TransactionId latest_xmin; + TimestampTz next_map_update_ts; + TransactionId threshold_timestamp; + TransactionId threshold_xid; - SpinLockAcquire(&oldSnapshotControl->mutex_latest_xmin); - latest_xmin = oldSnapshotControl->latest_xmin; - update_ts = oldSnapshotControl->next_map_update; - SpinLockRelease(&oldSnapshotControl->mutex_latest_xmin); + Assert(TransactionIdIsNormal(recentXmin)); + Assert(OldSnapshotThresholdActive()); + Assert(limit_ts != NULL && limit_xid != NULL); - /* - * Zero threshold always overrides to latest xmin, if valid. Without - * some heuristic it will find its own snapshot too old on, for - * example, a simple UPDATE -- which would make it useless for most - * testing, but there is no principled way to ensure that it doesn't - * fail in this way. Use a five-second delay to try to get useful - * testing behavior, but this may need adjustment. 
- */ - if (old_snapshot_threshold == 0) - { - if (TransactionIdPrecedes(latest_xmin, MyPgXact->xmin) - && TransactionIdFollows(latest_xmin, xlimit)) - xlimit = latest_xmin; + if (!RelationAllowsEarlyPruning(relation)) + return false; - ts -= 5 * USECS_PER_SEC; - SetOldSnapshotThresholdTimestamp(ts, xlimit); + ts = GetSnapshotCurrentTimestamp(); - return xlimit; - } + SpinLockAcquire(&oldSnapshotControl->mutex_latest_xmin); + latest_xmin = oldSnapshotControl->latest_xmin; + next_map_update_ts = oldSnapshotControl->next_map_update; + SpinLockRelease(&oldSnapshotControl->mutex_latest_xmin); + /* + * Zero threshold always overrides to latest xmin, if valid. Without some + * heuristic it will find its own snapshot too old on, for example, a + * simple UPDATE -- which would make it useless for most testing, but + * there is no principled way to ensure that it doesn't fail in this way. + * Use a five-second delay to try to get useful testing behavior, but this + * may need adjustment. + */ + if (old_snapshot_threshold == 0) + { + if (TransactionIdPrecedes(latest_xmin, MyPgXact->xmin) + && TransactionIdFollows(latest_xmin, xlimit)) + xlimit = latest_xmin; + + ts -= 5 * USECS_PER_SEC; + } + else + { ts = AlignTimestampToMinuteBoundary(ts) - (old_snapshot_threshold * USECS_PER_MINUTE); /* Check for fast exit without LW locking. */ SpinLockAcquire(&oldSnapshotControl->mutex_threshold); - if (ts == oldSnapshotControl->threshold_timestamp) - { - xlimit = oldSnapshotControl->threshold_xid; - same_ts_as_threshold = true; - } + threshold_timestamp = oldSnapshotControl->threshold_timestamp; + threshold_xid = oldSnapshotControl->threshold_xid; SpinLockRelease(&oldSnapshotControl->mutex_threshold); - if (!same_ts_as_threshold) + if (ts == threshold_timestamp) + { + /* + * Current timestamp is in same bucket as the the last limit that + * was applied. Reuse. 
+ */ + xlimit = threshold_xid; + } + else if (ts == next_map_update_ts) + { + /* + * FIXME: This branch is super iffy - but that should probably + * fixed separately. + */ + xlimit = latest_xmin; + } + else if (GetOldSnapshotFromTimeMapping(ts, &xlimit)) { - if (ts == update_ts) - { - xlimit = latest_xmin; - if (NormalTransactionIdFollows(xlimit, recentXmin)) - SetOldSnapshotThresholdTimestamp(ts, xlimit); - } - else - { - LWLockAcquire(OldSnapshotTimeMapLock, LW_SHARED); - - if (oldSnapshotControl->count_used > 0 - && ts >= oldSnapshotControl->head_timestamp) - { - int offset; - - offset = ((ts - oldSnapshotControl->head_timestamp) - / USECS_PER_MINUTE); - if (offset > oldSnapshotControl->count_used - 1) - offset = oldSnapshotControl->count_used - 1; - offset = (oldSnapshotControl->head_offset + offset) - % OLD_SNAPSHOT_TIME_MAP_ENTRIES; - xlimit = oldSnapshotControl->xid_by_minute[offset]; - - if (NormalTransactionIdFollows(xlimit, recentXmin)) - SetOldSnapshotThresholdTimestamp(ts, xlimit); - } - - LWLockRelease(OldSnapshotTimeMapLock); - } } /* @@ -1867,12 +1879,18 @@ TransactionIdLimitedForOldSnapshots(TransactionId recentXmin, if (TransactionIdIsNormal(latest_xmin) && TransactionIdPrecedes(latest_xmin, xlimit)) xlimit = latest_xmin; + } + + if (TransactionIdIsValid(xlimit) && + TransactionIdFollowsOrEquals(xlimit, recentXmin)) + { + *limit_ts = ts; + *limit_xid = xlimit; - if (NormalTransactionIdFollows(xlimit, recentXmin)) - return xlimit; + return true; } - return recentXmin; + return false; } /* diff --git a/src/include/access/ginblock.h b/src/include/access/ginblock.h index 3f64fd572e32..fe66a95226b9 100644 --- a/src/include/access/ginblock.h +++ b/src/include/access/ginblock.h @@ -12,6 +12,7 @@ #include "access/transam.h" #include "storage/block.h" +#include "storage/bufpage.h" #include "storage/itemptr.h" #include "storage/off.h" @@ -134,8 +135,7 @@ typedef struct GinMetaPageData */ #define GinPageGetDeleteXid(page) ( ((PageHeader) (page))->pd_prune_xid 
) #define GinPageSetDeleteXid(page, xid) ( ((PageHeader) (page))->pd_prune_xid = xid) -#define GinPageIsRecyclable(page) ( PageIsNew(page) || (GinPageIsDeleted(page) \ - && TransactionIdPrecedes(GinPageGetDeleteXid(page), RecentGlobalXmin))) +extern bool GinPageIsRecyclable(Page page); /* * We use our own ItemPointerGet(BlockNumber|OffsetNumber) diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index b31de389106d..ba77013f64f2 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -172,9 +172,12 @@ extern TransactionId heap_compute_xid_horizon_for_tuples(Relation rel, int nitems); /* in heap/pruneheap.c */ +struct GlobalVisState; extern void heap_page_prune_opt(Relation relation, Buffer buffer); extern int heap_page_prune(Relation relation, Buffer buffer, - TransactionId OldestXmin, + struct GlobalVisState *vistest, + TransactionId limited_oldest_xmin, + TimestampTz limited_oldest_ts, bool report_stats, TransactionId *latestRemovedXid); extern void heap_page_prune_execute(Buffer buffer, OffsetNumber *redirected, int nredirected, @@ -195,11 +198,14 @@ extern TM_Result HeapTupleSatisfiesUpdate(HeapTuple stup, CommandId curcid, Buffer buffer); extern HTSV_Result HeapTupleSatisfiesVacuum(HeapTuple stup, TransactionId OldestXmin, Buffer buffer); +extern HTSV_Result HeapTupleSatisfiesVacuumHorizon(HeapTuple stup, Buffer buffer, + TransactionId *dead_after); extern void HeapTupleSetHintBits(HeapTupleHeader tuple, Buffer buffer, uint16 infomask, TransactionId xid); extern bool HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple); extern bool XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot); -extern bool HeapTupleIsSurelyDead(HeapTuple htup, TransactionId OldestXmin); +extern bool HeapTupleIsSurelyDead(HeapTuple htup, + struct GlobalVisState *vistest); /* * To avoid leaking too much knowledge about reorderbuffer implementation diff --git a/src/include/access/transam.h b/src/include/access/transam.h index 
8db326ad1b50..b32044153b09 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -95,15 +95,6 @@ FullTransactionIdFromU64(uint64 value) (dest) = FirstNormalTransactionId; \ } while(0) -/* advance a FullTransactionId variable, stepping over special XIDs */ -static inline void -FullTransactionIdAdvance(FullTransactionId *dest) -{ - dest->value++; - while (XidFromFullTransactionId(*dest) < FirstNormalTransactionId) - dest->value++; -} - /* * Retreat a FullTransactionId variable, stepping over xids that would appear * to be special only when viewed as 32bit XIDs. @@ -129,6 +120,23 @@ FullTransactionIdRetreat(FullTransactionId *dest) dest->value--; } +/* + * Advance a FullTransactionId variable, stepping over xids that would appear + * to be special only when viewed as 32bit XIDs. + */ +static inline void +FullTransactionIdAdvance(FullTransactionId *dest) +{ + dest->value++; + + /* see FullTransactionIdAdvance() */ + if (FullTransactionIdPrecedes(*dest, FirstNormalFullTransactionId)) + return; + + while (XidFromFullTransactionId(*dest) < FirstNormalTransactionId) + dest->value++; +} + /* back up a transaction ID variable, handling wraparound correctly */ #define TransactionIdRetreat(dest) \ do { \ @@ -293,6 +301,59 @@ ReadNewTransactionId(void) return XidFromFullTransactionId(ReadNextFullTransactionId()); } +/* return transaction ID backed up by amount, handling wraparound correctly */ +static inline TransactionId +TransactionIdRetreatedBy(TransactionId xid, uint32 amount) +{ + xid -= amount; + + while (xid < FirstNormalTransactionId) + xid--; + + return xid; +} + +/* return the older of the two IDs */ +static inline TransactionId +TransactionIdOlder(TransactionId a, TransactionId b) +{ + if (!TransactionIdIsValid(a)) + return b; + + if (!TransactionIdIsValid(b)) + return a; + + if (TransactionIdPrecedes(a, b)) + return a; + return b; +} + +/* return the older of the two IDs, assuming they're both normal */ +static inline TransactionId 
+NormalTransactionIdOlder(TransactionId a, TransactionId b) +{ + Assert(TransactionIdIsNormal(a)); + Assert(TransactionIdIsNormal(b)); + if (NormalTransactionIdPrecedes(a, b)) + return a; + return b; +} + +/* return the newer of the two IDs */ +static inline FullTransactionId +FullTransactionIdNewer(FullTransactionId a, FullTransactionId b) +{ + if (!FullTransactionIdIsValid(a)) + return b; + + if (!FullTransactionIdIsValid(b)) + return a; + + if (FullTransactionIdFollows(a, b)) + return a; + return b; +} + #endif /* FRONTEND */ #endif /* TRANSAM_H */ diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index 3f88683a059d..51b8f994ac0a 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -389,12 +389,6 @@ PageValidateSpecialPointer(Page page) #define PageClearAllVisible(page) \ (((PageHeader) (page))->pd_flags &= ~PD_ALL_VISIBLE) -#define PageIsPrunable(page, oldestxmin) \ -( \ - AssertMacro(TransactionIdIsNormal(oldestxmin)), \ - TransactionIdIsValid(((PageHeader) (page))->pd_prune_xid) && \ - TransactionIdPrecedes(((PageHeader) (page))->pd_prune_xid, oldestxmin) \ -) #define PageSetPrunable(page, xid) \ do { \ Assert(TransactionIdIsNormal(xid)); \ diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index 5ceb2494bae7..52ff43cabaaf 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -42,20 +42,12 @@ struct XidCache /* * Flags for PGXACT->vacuumFlags - * - * Note: If you modify these flags, you need to modify PROCARRAY_XXX flags - * in src/include/storage/procarray.h. - * - * PROC_RESERVED may later be assigned for use in vacuumFlags, but its value is - * used for PROCARRAY_SLOTS_XMIN in procarray.h, so GetOldestXmin won't be able - * to match and ignore processes with this flag set. */ #define PROC_IS_AUTOVACUUM 0x01 /* is it an autovac worker? 
*/ #define PROC_IN_VACUUM 0x02 /* currently running lazy vacuum */ #define PROC_VACUUM_FOR_WRAPAROUND 0x08 /* set by autovac only */ #define PROC_IN_LOGICAL_DECODING 0x10 /* currently doing logical * decoding outside xact */ -#define PROC_RESERVED 0x20 /* reserved for procarray */ /* flags reset at EOXact */ #define PROC_VACUUM_STATE_MASK \ diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index 01040d76e122..ea8a876ca45c 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -20,34 +20,6 @@ #include "utils/snapshot.h" -/* - * These are to implement PROCARRAY_FLAGS_XXX - * - * Note: These flags are cloned from PROC_XXX flags in src/include/storage/proc.h - * to avoid forcing to include proc.h when including procarray.h. So if you modify - * PROC_XXX flags, you need to modify these flags. - */ -#define PROCARRAY_VACUUM_FLAG 0x02 /* currently running lazy - * vacuum */ -#define PROCARRAY_LOGICAL_DECODING_FLAG 0x10 /* currently doing logical - * decoding outside xact */ - -#define PROCARRAY_SLOTS_XMIN 0x20 /* replication slot xmin, - * catalog_xmin */ -/* - * Only flags in PROCARRAY_PROC_FLAGS_MASK are considered when matching - * PGXACT->vacuumFlags. Other flags are used for different purposes and - * have no corresponding PROC flag equivalent. 
- */ -#define PROCARRAY_PROC_FLAGS_MASK (PROCARRAY_VACUUM_FLAG | \ - PROCARRAY_LOGICAL_DECODING_FLAG) - -/* Use the following flags as an input "flags" to GetOldestXmin function */ -/* Consider all backends except for logical decoding ones which manage xmin separately */ -#define PROCARRAY_FLAGS_DEFAULT PROCARRAY_LOGICAL_DECODING_FLAG -/* Ignore vacuum backends */ -#define PROCARRAY_FLAGS_VACUUM PROCARRAY_FLAGS_DEFAULT | PROCARRAY_VACUUM_FLAG - extern Size ProcArrayShmemSize(void); extern void CreateSharedProcArray(void); extern void ProcArrayAdd(PGPROC *proc); @@ -81,9 +53,11 @@ extern RunningTransactions GetRunningTransactionData(void); extern bool TransactionIdIsInProgress(TransactionId xid); extern bool TransactionIdIsActive(TransactionId xid); -extern TransactionId GetOldestXmin(Relation rel, int flags); +extern TransactionId GetOldestNonRemovableTransactionId(Relation rel); +extern TransactionId GetOldestTransactionIdConsideredRunning(void); extern TransactionId GetOldestActiveTransactionId(void); extern TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly); +extern void GetReplicationHorizons(TransactionId *slot_xmin, TransactionId *catalog_xmin); extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids); extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids); diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h index ffb4ba3adfb0..b6b403e29313 100644 --- a/src/include/utils/snapmgr.h +++ b/src/include/utils/snapmgr.h @@ -52,13 +52,12 @@ extern Size SnapMgrShmemSize(void); extern void SnapMgrInit(void); extern TimestampTz GetSnapshotCurrentTimestamp(void); extern TimestampTz GetOldSnapshotThresholdTimestamp(void); +extern void SnapshotTooOldMagicForTest(void); extern bool FirstSnapshotSet; extern PGDLLIMPORT TransactionId TransactionXmin; extern PGDLLIMPORT TransactionId RecentXmin; -extern PGDLLIMPORT TransactionId RecentGlobalXmin; -extern PGDLLIMPORT TransactionId RecentGlobalDataXmin; 
/* Variables representing various special snapshot semantics */ extern PGDLLIMPORT SnapshotData SnapshotSelfData; @@ -78,11 +77,12 @@ extern PGDLLIMPORT SnapshotData CatalogSnapshotData; /* * Similarly, some initialization is required for a NonVacuumable snapshot. - * The caller must supply the xmin horizon to use (e.g., RecentGlobalXmin). + * The caller must supply the visibility cutoff state to use (c.f. + * GlobalVisTestFor()). */ -#define InitNonVacuumableSnapshot(snapshotdata, xmin_horizon) \ +#define InitNonVacuumableSnapshot(snapshotdata, vistestp) \ ((snapshotdata).snapshot_type = SNAPSHOT_NON_VACUUMABLE, \ - (snapshotdata).xmin = (xmin_horizon)) + (snapshotdata).vistest = (vistestp)) /* * Similarly, some initialization is required for SnapshotToast. We need @@ -98,6 +98,11 @@ extern PGDLLIMPORT SnapshotData CatalogSnapshotData; ((snapshot)->snapshot_type == SNAPSHOT_MVCC || \ (snapshot)->snapshot_type == SNAPSHOT_HISTORIC_MVCC) +static inline bool +OldSnapshotThresholdActive(void) +{ + return old_snapshot_threshold >= 0; +} extern Snapshot GetTransactionSnapshot(void); extern Snapshot GetLatestSnapshot(void); @@ -121,8 +126,6 @@ extern void UnregisterSnapshot(Snapshot snapshot); extern Snapshot RegisterSnapshotOnOwner(Snapshot snapshot, ResourceOwner owner); extern void UnregisterSnapshotFromOwner(Snapshot snapshot, ResourceOwner owner); -extern FullTransactionId GetFullRecentGlobalXmin(void); - extern void AtSubCommit_Snapshot(int level); extern void AtSubAbort_Snapshot(int level); extern void AtEOXact_Snapshot(bool isCommit, bool resetXmin); @@ -131,13 +134,29 @@ extern void ImportSnapshot(const char *idstr); extern bool XactHasExportedSnapshots(void); extern void DeleteAllExportedSnapshotFiles(void); extern bool ThereAreNoPriorRegisteredSnapshots(void); -extern TransactionId TransactionIdLimitedForOldSnapshots(TransactionId recentXmin, - Relation relation); +extern bool TransactionIdLimitedForOldSnapshots(TransactionId recentXmin, + Relation relation, + 
TransactionId *limit_xid, + TimestampTz *limit_ts); +extern void SetOldSnapshotThresholdTimestamp(TimestampTz ts, TransactionId xlimit); extern void MaintainOldSnapshotTimeMapping(TimestampTz whenTaken, TransactionId xmin); extern char *ExportSnapshot(Snapshot snapshot); +/* + * These live in procarray.c because they're intimately linked to the + * procarray contents, but thematically they better fit into snapmgr.h. + */ +typedef struct GlobalVisState GlobalVisState; +extern GlobalVisState *GlobalVisTestFor(Relation rel); +extern bool GlobalVisTestIsRemovableXid(GlobalVisState *state, TransactionId xid); +extern bool GlobalVisTestIsRemovableFullXid(GlobalVisState *state, FullTransactionId fxid); +extern FullTransactionId GlobalVisTestNonRemovableFullHorizon(GlobalVisState *state); +extern TransactionId GlobalVisTestNonRemovableHorizon(GlobalVisState *state); +extern bool GlobalVisCheckRemovableXid(Relation rel, TransactionId xid); +extern bool GlobalVisIsRemovableFullXid(Relation rel, FullTransactionId fxid); + /* * Utility functions for implementing visibility routines in table AMs. */ diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h index 4796edb63aa2..35b1f05bea65 100644 --- a/src/include/utils/snapshot.h +++ b/src/include/utils/snapshot.h @@ -192,6 +192,12 @@ typedef struct SnapshotData */ uint32 speculativeToken; + /* + * For SNAPSHOT_NON_VACUUMABLE (and hopefully more in the future) this is + * used to determine whether row could be vacuumed. 
+ */ + struct GlobalVisState *vistest; + /* * Book-keeping information, used by the snapshot manager */ diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 7eaaad1e140a..b4948ac675f7 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -395,6 +395,7 @@ CompositeTypeStmt CompoundAffixFlag CompressionAlgorithm CompressorState +ComputeXidHorizonsResult ConditionVariable ConditionalStack ConfigData @@ -930,6 +931,7 @@ GistSplitVector GistTsVectorOptions GistVacState GlobalTransaction +GlobalVisState GrantRoleStmt GrantStmt GrantTargetType From b8443eae72b5c36e6b443a2f09b9c605c61a589d Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Wed, 12 Aug 2020 17:04:51 -0700 Subject: [PATCH 288/334] Fix out-of-date version reference, grammar. Time appears to be passing fast. Reported-By: Peter Geoghegan --- src/backend/access/nbtree/README | 2 +- src/backend/access/transam/README | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 781a8f1932d3..9692e4cdf644 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -412,7 +412,7 @@ the cost of walking down the tree in such common cases. The optimization works on the assumption that there can only be one non-ignorable leaf rightmost page, and so not even a visible-to-everyone -style interlock required. We cannot fail to detect that our hint was +style interlock is required. We cannot fail to detect that our hint was invalidated, because there can only be one such page in the B-Tree at any time. 
It's possible that the page will be deleted and recycled without a backend's cached page also being detected as invalidated, but diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README index 6f44ae9ce6a5..98acb429b67e 100644 --- a/src/backend/access/transam/README +++ b/src/backend/access/transam/README @@ -318,7 +318,7 @@ XID less than this could be about to appear in the ProcArray, because of the XidGenLock interlock discussed above.) As GetSnapshotData is performance critical, it does not perform an accurate -oldest-xmin calculation (it used to, until v13). The contents of a snapshot +oldest-xmin calculation (it used to, until v14). The contents of a snapshot only depend on the xids of other backends, not their xmin. As backend's xmin changes much more often than its xid, having GetSnapshotData look at xmins can lead to a lot of unnecessary cacheline ping-pong. Instead From a811ea5bde2fbf450095994b5726dcbf64d68668 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Thu, 13 Aug 2020 17:33:49 -0400 Subject: [PATCH 289/334] Handle new HOT chains in index-build table scans MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a table is scanned by heapam_index_build_range_scan (née IndexBuildHeapScan) and the table lock being held allows concurrent data changes, it is possible for new HOT chains to sprout in a page that were unknown when the scan of a page happened. This leads to an error such as ERROR: failed to find parent tuple for heap-only tuple at (X,Y) in table "tbl" because the root tuple was not present when we first obtained the list of the page's root tuples. This can be fixed by re-obtaining the list of root tuples, if we see that a heap-only tuple appears to point to a non-existing root. 
This was reported by Anastasia as occurring for BRIN summarization (which exists since 9.5), but I think it could theoretically also happen with CREATE INDEX CONCURRENTLY (much older) or REINDEX CONCURRENTLY (very recent). It seems a happy coincidence that BRIN forces us to backpatch this all the way to 9.5. Reported-by: Anastasia Lubennikova Diagnosed-by: Anastasia Lubennikova Co-authored-by: Anastasia Lubennikova Co-authored-by: Álvaro Herrera Discussion: https://postgr.es/m/602d8487-f0b2-5486-0088-0f372b2549fa@postgrespro.ru Backpatch: 9.5 - master --- src/backend/access/heap/heapam_handler.c | 20 ++++++++++++++++++++ src/backend/access/heap/pruneheap.c | 5 +++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index e3e41fb75163..dcaea7135fb2 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -1324,6 +1324,12 @@ heapam_index_build_range_scan(Relation heapRelation, * buffer continuously while visiting the page, so no pruning * operation can occur either. * + * In cases with only ShareUpdateExclusiveLock on the table, it's + * possible for some HOT tuples to appear that we didn't know about + * when we first read the page. To handle that case, we re-obtain the + * list of root offsets when a HOT tuple points to a root item that we + * don't know about. + * * Also, although our opinions about tuple liveness could change while * we scan the page (due to concurrent transaction commits/aborts), * the chain root locations won't, so this info doesn't need to be @@ -1625,6 +1631,20 @@ heapam_index_build_range_scan(Relation heapRelation, offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self); + /* + * If a HOT tuple points to a root that we don't know + * about, obtain root items afresh. If that still fails, + * report it as corruption. 
+ */ + if (root_offsets[offnum - 1] == InvalidOffsetNumber) + { + Page page = BufferGetPage(hscan->rs_cbuf); + + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); + heap_get_root_tuples(page, root_offsets); + LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); + } + if (!OffsetNumberIsValid(root_offsets[offnum - 1])) ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 00a3cb106aac..3ad4222cb8af 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -855,7 +855,7 @@ heap_page_prune_execute(Buffer buffer, * root_offsets[k - 1] = j. * * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries. - * We zero out all unused entries. + * Unused entries are filled with InvalidOffsetNumber (zero). * * The function must be called with at least share lock on the buffer, to * prevent concurrent prune operations. @@ -870,7 +870,8 @@ heap_get_root_tuples(Page page, OffsetNumber *root_offsets) OffsetNumber offnum, maxoff; - MemSet(root_offsets, 0, MaxHeapTuplesPerPage * sizeof(OffsetNumber)); + MemSet(root_offsets, InvalidOffsetNumber, + MaxHeapTuplesPerPage * sizeof(OffsetNumber)); maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) From 1f51c17c68d05c28d5b9294d8013cb9e7e653160 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Thu, 13 Aug 2020 16:25:21 -0700 Subject: [PATCH 290/334] snapshot scalability: Move PGXACT->xmin back to PGPROC. Now that xmin isn't needed for GetSnapshotData() anymore, it leads to unnecessary cacheline ping-pong to have it in PGXACT, as it is updated considerably more frequently than the other PGXACT members. After the changes in dc7420c2c92, this is a very straight-forward change. For highly concurrent, snapshot acquisition heavy, workloads this change alone can significantly increase scalability. E.g. 
plain pgbench on a smaller 2 socket machine gains 1.07x for read-only pgbench, 1.22x for read-only pgbench when submitting queries in batches of 100, and 2.85x for batches of 100 'SELECT';. The latter numbers are obviously not to be expected in the real-world, but micro-benchmark the snapshot computation scalability (previously spending ~80% of the time in GetSnapshotData()). Author: Andres Freund Reviewed-By: Robert Haas Reviewed-By: Thomas Munro Reviewed-By: David Rowley Discussion: https://postgr.es/m/20200301083601.ews6hz5dduc3w2se@alap3.anarazel.de --- src/backend/access/gist/gistxlog.c | 2 +- src/backend/access/nbtree/nbtpage.c | 2 +- src/backend/access/transam/README | 4 +-- src/backend/access/transam/twophase.c | 2 +- src/backend/commands/indexcmds.c | 2 +- src/backend/replication/logical/snapbuild.c | 6 ++-- src/backend/replication/walsender.c | 10 +++--- src/backend/storage/ipc/procarray.c | 36 +++++++++------------ src/backend/storage/ipc/sinvaladt.c | 2 +- src/backend/storage/lmgr/proc.c | 4 +-- src/backend/utils/time/snapmgr.c | 28 ++++++++-------- src/include/storage/proc.h | 10 +++--- 12 files changed, 52 insertions(+), 56 deletions(-) diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index a63b05388c5d..dcd28f678b3d 100644 --- a/src/backend/access/gist/gistxlog.c +++ b/src/backend/access/gist/gistxlog.c @@ -389,7 +389,7 @@ gistRedoPageReuse(XLogReaderState *record) * * latestRemovedXid was the page's deleteXid. The * GlobalVisIsRemovableFullXid(deleteXid) test in gistPageRecyclable() - * conceptually mirrors the pgxact->xmin > limitXmin test in + * conceptually mirrors the PGPROC->xmin > limitXmin test in * GetConflictingVirtualXIDs(). Consequently, one XID value achieves the * same exclusion effect on primary and standby. 
*/ diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 74be3807bb7d..7f392480ac0f 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -2317,7 +2317,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, * we're in VACUUM and would not otherwise have an XID. Having already * updated links to the target, ReadNewTransactionId() suffices as an * upper bound. Any scan having retained a now-stale link is advertising - * in its PGXACT an xmin less than or equal to the value we read here. It + * in its PGPROC an xmin less than or equal to the value we read here. It * will continue to do so, holding back the xmin horizon, for the duration * of that scan. */ diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README index 98acb429b67e..eab8edd20ec2 100644 --- a/src/backend/access/transam/README +++ b/src/backend/access/transam/README @@ -296,7 +296,7 @@ ensure that the C compiler does exactly what you tell it to.) Another important activity that uses the shared ProcArray is ComputeXidHorizons, which must determine a lower bound for the oldest xmin of any active MVCC snapshot, system-wide. Each individual backend -advertises the smallest xmin of its own snapshots in MyPgXact->xmin, or zero +advertises the smallest xmin of its own snapshots in MyProc->xmin, or zero if it currently has no live snapshots (eg, if it's between transactions or hasn't yet set a snapshot for a new transaction). ComputeXidHorizons takes the MIN() of the valid xmin fields. It does this with only shared lock on @@ -331,7 +331,7 @@ necessary. Note that while it is certain that two concurrent executions of GetSnapshotData will compute the same xmin for their own snapshots, there is no such guarantee for the horizons computed by ComputeXidHorizons. 
This is -because we allow XID-less transactions to clear their MyPgXact->xmin +because we allow XID-less transactions to clear their MyProc->xmin asynchronously (without taking ProcArrayLock), so one execution might see what had been the oldest xmin, and another not. This is OK since the thresholds need only be a valid lower bound. As noted above, we are already diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 31f135f5cedc..eb5f4680a3d9 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -464,7 +464,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid, /* We set up the gxact's VXID as InvalidBackendId/XID */ proc->lxid = (LocalTransactionId) xid; pgxact->xid = xid; - pgxact->xmin = InvalidTransactionId; + Assert(proc->xmin == InvalidTransactionId); proc->delayChkpt = false; pgxact->vacuumFlags = 0; proc->pid = 0; diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 7819266a6306..254dbcdce52b 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -1535,7 +1535,7 @@ DefineIndex(Oid relationId, StartTransactionCommand(); /* We should now definitely not be advertising any xmin. 
*/ - Assert(MyPgXact->xmin == InvalidTransactionId); + Assert(MyProc->xmin == InvalidTransactionId); /* * The index is now valid in the sense that it contains all currently diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index 3089f0d5ddcd..e9701ea72215 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -553,8 +553,8 @@ SnapBuildInitialSnapshot(SnapBuild *builder) elog(ERROR, "cannot build an initial slot snapshot, not all transactions are monitored anymore"); /* so we don't overwrite the existing value */ - if (TransactionIdIsValid(MyPgXact->xmin)) - elog(ERROR, "cannot build an initial slot snapshot when MyPgXact->xmin already is valid"); + if (TransactionIdIsValid(MyProc->xmin)) + elog(ERROR, "cannot build an initial slot snapshot when MyProc->xmin already is valid"); snap = SnapBuildBuildSnapshot(builder); @@ -575,7 +575,7 @@ SnapBuildInitialSnapshot(SnapBuild *builder) } #endif - MyPgXact->xmin = snap->xmin; + MyProc->xmin = snap->xmin; /* allocate in transaction context */ newxip = (TransactionId *) diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 460ca3f947f4..3f756b470af1 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -1964,7 +1964,7 @@ PhysicalReplicationSlotNewXmin(TransactionId feedbackXmin, TransactionId feedbac ReplicationSlot *slot = MyReplicationSlot; SpinLockAcquire(&slot->mutex); - MyPgXact->xmin = InvalidTransactionId; + MyProc->xmin = InvalidTransactionId; /* * For physical replication we don't need the interlock provided by xmin @@ -2093,7 +2093,7 @@ ProcessStandbyHSFeedbackMessage(void) if (!TransactionIdIsNormal(feedbackXmin) && !TransactionIdIsNormal(feedbackCatalogXmin)) { - MyPgXact->xmin = InvalidTransactionId; + MyProc->xmin = InvalidTransactionId; if (MyReplicationSlot != NULL) PhysicalReplicationSlotNewXmin(feedbackXmin, 
feedbackCatalogXmin); return; @@ -2135,7 +2135,7 @@ ProcessStandbyHSFeedbackMessage(void) * risk already since a VACUUM could already have determined the horizon.) * * If we're using a replication slot we reserve the xmin via that, - * otherwise via the walsender's PGXACT entry. We can only track the + * otherwise via the walsender's PGPROC entry. We can only track the * catalog xmin separately when using a slot, so we store the least of the * two provided when not using a slot. * @@ -2148,9 +2148,9 @@ ProcessStandbyHSFeedbackMessage(void) { if (TransactionIdIsNormal(feedbackCatalogXmin) && TransactionIdPrecedes(feedbackCatalogXmin, feedbackXmin)) - MyPgXact->xmin = feedbackCatalogXmin; + MyProc->xmin = feedbackCatalogXmin; else - MyPgXact->xmin = feedbackXmin; + MyProc->xmin = feedbackXmin; } } diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index e582d5af4291..185f581c8b6f 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -587,9 +587,9 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) Assert(!TransactionIdIsValid(allPgXact[proc->pgprocno].xid)); proc->lxid = InvalidLocalTransactionId; - pgxact->xmin = InvalidTransactionId; /* must be cleared with xid/xmin: */ pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK; + proc->xmin = InvalidTransactionId; proc->delayChkpt = false; /* be sure this is cleared in abort */ proc->recoveryConflictPending = false; @@ -609,9 +609,9 @@ ProcArrayEndTransactionInternal(PGPROC *proc, PGXACT *pgxact, { pgxact->xid = InvalidTransactionId; proc->lxid = InvalidLocalTransactionId; - pgxact->xmin = InvalidTransactionId; /* must be cleared with xid/xmin: */ pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK; + proc->xmin = InvalidTransactionId; proc->delayChkpt = false; /* be sure this is cleared in abort */ proc->recoveryConflictPending = false; @@ -763,7 +763,7 @@ ProcArrayClearTransaction(PGPROC *proc) */ pgxact->xid = InvalidTransactionId; 
proc->lxid = InvalidLocalTransactionId; - pgxact->xmin = InvalidTransactionId; + proc->xmin = InvalidTransactionId; proc->recoveryConflictPending = false; /* redundant, but just in case */ @@ -1563,7 +1563,7 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h) /* Fetch xid just once - see GetNewTransactionId */ xid = UINT32_ACCESS_ONCE(pgxact->xid); - xmin = UINT32_ACCESS_ONCE(pgxact->xmin); + xmin = UINT32_ACCESS_ONCE(proc->xmin); /* * Consider both the transaction's Xmin, and its Xid. @@ -1838,7 +1838,7 @@ GetMaxSnapshotSubxidCount(void) * * We also update the following backend-global variables: * TransactionXmin: the oldest xmin of any snapshot in use in the - * current transaction (this is the same as MyPgXact->xmin). + * current transaction (this is the same as MyProc->xmin). * RecentXmin: the xmin computed for the most recent snapshot. XIDs * older than this are known not running any more. * @@ -1899,7 +1899,7 @@ GetSnapshotData(Snapshot snapshot) /* * It is sufficient to get shared lock on ProcArrayLock, even if we are - * going to set MyPgXact->xmin. + * going to set MyProc->xmin. */ LWLockAcquire(ProcArrayLock, LW_SHARED); @@ -2051,8 +2051,8 @@ GetSnapshotData(Snapshot snapshot) replication_slot_xmin = procArray->replication_slot_xmin; replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin; - if (!TransactionIdIsValid(MyPgXact->xmin)) - MyPgXact->xmin = TransactionXmin = xmin; + if (!TransactionIdIsValid(MyProc->xmin)) + MyProc->xmin = TransactionXmin = xmin; LWLockRelease(ProcArrayLock); @@ -2172,7 +2172,7 @@ GetSnapshotData(Snapshot snapshot) } /* - * ProcArrayInstallImportedXmin -- install imported xmin into MyPgXact->xmin + * ProcArrayInstallImportedXmin -- install imported xmin into MyProc->xmin * * This is called when installing a snapshot imported from another * transaction. 
To ensure that OldestXmin doesn't go backwards, we must @@ -2225,7 +2225,7 @@ ProcArrayInstallImportedXmin(TransactionId xmin, /* * Likewise, let's just make real sure its xmin does cover us. */ - xid = UINT32_ACCESS_ONCE(pgxact->xmin); + xid = UINT32_ACCESS_ONCE(proc->xmin); if (!TransactionIdIsNormal(xid) || !TransactionIdPrecedesOrEquals(xid, xmin)) continue; @@ -2236,7 +2236,7 @@ ProcArrayInstallImportedXmin(TransactionId xmin, * GetSnapshotData first, we'll be overwriting a valid xmin here, so * we don't check that.) */ - MyPgXact->xmin = TransactionXmin = xmin; + MyProc->xmin = TransactionXmin = xmin; result = true; break; @@ -2248,7 +2248,7 @@ ProcArrayInstallImportedXmin(TransactionId xmin, } /* - * ProcArrayInstallRestoredXmin -- install restored xmin into MyPgXact->xmin + * ProcArrayInstallRestoredXmin -- install restored xmin into MyProc->xmin * * This is like ProcArrayInstallImportedXmin, but we have a pointer to the * PGPROC of the transaction from which we imported the snapshot, rather than @@ -2261,7 +2261,6 @@ ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc) { bool result = false; TransactionId xid; - PGXACT *pgxact; Assert(TransactionIdIsNormal(xmin)); Assert(proc != NULL); @@ -2269,20 +2268,18 @@ ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc) /* Get lock so source xact can't end while we're doing this */ LWLockAcquire(ProcArrayLock, LW_SHARED); - pgxact = &allPgXact[proc->pgprocno]; - /* * Be certain that the referenced PGPROC has an advertised xmin which is * no later than the one we're installing, so that the system-wide xmin * can't go backwards. Also, make sure it's running in the same database, * so that the per-database xmin cannot go backwards. 
*/ - xid = UINT32_ACCESS_ONCE(pgxact->xmin); + xid = UINT32_ACCESS_ONCE(proc->xmin); if (proc->databaseId == MyDatabaseId && TransactionIdIsNormal(xid) && TransactionIdPrecedesOrEquals(xid, xmin)) { - MyPgXact->xmin = TransactionXmin = xmin; + MyProc->xmin = TransactionXmin = xmin; result = true; } @@ -2908,7 +2905,7 @@ GetCurrentVirtualXIDs(TransactionId limitXmin, bool excludeXmin0, if (allDbs || proc->databaseId == MyDatabaseId) { /* Fetch xmin just once - might change on us */ - TransactionId pxmin = UINT32_ACCESS_ONCE(pgxact->xmin); + TransactionId pxmin = UINT32_ACCESS_ONCE(proc->xmin); if (excludeXmin0 && !TransactionIdIsValid(pxmin)) continue; @@ -2994,7 +2991,6 @@ GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid) { int pgprocno = arrayP->pgprocnos[index]; PGPROC *proc = &allProcs[pgprocno]; - PGXACT *pgxact = &allPgXact[pgprocno]; /* Exclude prepared transactions */ if (proc->pid == 0) @@ -3004,7 +3000,7 @@ GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid) proc->databaseId == dbOid) { /* Fetch xmin just once - can't change on us, but good coding */ - TransactionId pxmin = UINT32_ACCESS_ONCE(pgxact->xmin); + TransactionId pxmin = UINT32_ACCESS_ONCE(proc->xmin); /* * We ignore an invalid pxmin because this means that backend has diff --git a/src/backend/storage/ipc/sinvaladt.c b/src/backend/storage/ipc/sinvaladt.c index e5c115b92f2b..ad048bc85fab 100644 --- a/src/backend/storage/ipc/sinvaladt.c +++ b/src/backend/storage/ipc/sinvaladt.c @@ -420,7 +420,7 @@ BackendIdGetTransactionIds(int backendID, TransactionId *xid, TransactionId *xmi PGXACT *xact = &ProcGlobal->allPgXact[proc->pgprocno]; *xid = xact->xid; - *xmin = xact->xmin; + *xmin = proc->xmin; } } diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index e57fcd253880..de346cd87fcd 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -388,7 +388,7 @@ InitProcess(void) MyProc->fpVXIDLock = false; 
MyProc->fpLocalTransactionId = InvalidLocalTransactionId; MyPgXact->xid = InvalidTransactionId; - MyPgXact->xmin = InvalidTransactionId; + MyProc->xmin = InvalidTransactionId; MyProc->pid = MyProcPid; /* backendId, databaseId and roleId will be filled in later */ MyProc->backendId = InvalidBackendId; @@ -572,7 +572,7 @@ InitAuxiliaryProcess(void) MyProc->fpVXIDLock = false; MyProc->fpLocalTransactionId = InvalidLocalTransactionId; MyPgXact->xid = InvalidTransactionId; - MyPgXact->xmin = InvalidTransactionId; + MyProc->xmin = InvalidTransactionId; MyProc->backendId = InvalidBackendId; MyProc->databaseId = InvalidOid; MyProc->roleId = InvalidOid; diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 604d823f6861..752af0c10dfc 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -27,11 +27,11 @@ * their lifetime is managed separately (as they live longer than one xact.c * transaction). * - * These arrangements let us reset MyPgXact->xmin when there are no snapshots + * These arrangements let us reset MyProc->xmin when there are no snapshots * referenced by this transaction, and advance it when the one with oldest * Xmin is no longer referenced. For simplicity however, only registered * snapshots not active snapshots participate in tracking which one is oldest; - * we don't try to change MyPgXact->xmin except when the active-snapshot + * we don't try to change MyProc->xmin except when the active-snapshot * stack is empty. * * @@ -187,7 +187,7 @@ static ActiveSnapshotElt *OldestActiveSnapshot = NULL; /* * Currently registered Snapshots. Ordered in a heap by xmin, so that we can - * quickly find the one with lowest xmin, to advance our MyPgXact->xmin. + * quickly find the one with lowest xmin, to advance our MyProc->xmin. 
*/ static int xmin_cmp(const pairingheap_node *a, const pairingheap_node *b, void *arg); @@ -475,7 +475,7 @@ GetNonHistoricCatalogSnapshot(Oid relid) /* * Make sure the catalog snapshot will be accounted for in decisions - * about advancing PGXACT->xmin. We could apply RegisterSnapshot, but + * about advancing PGPROC->xmin. We could apply RegisterSnapshot, but * that would result in making a physical copy, which is overkill; and * it would also create a dependency on some resource owner, which we * do not want for reasons explained at the head of this file. Instead @@ -596,7 +596,7 @@ SetTransactionSnapshot(Snapshot sourcesnap, VirtualTransactionId *sourcevxid, /* NB: curcid should NOT be copied, it's a local matter */ /* - * Now we have to fix what GetSnapshotData did with MyPgXact->xmin and + * Now we have to fix what GetSnapshotData did with MyProc->xmin and * TransactionXmin. There is a race condition: to make sure we are not * causing the global xmin to go backwards, we have to test that the * source transaction is still running, and that has to be done @@ -950,13 +950,13 @@ xmin_cmp(const pairingheap_node *a, const pairingheap_node *b, void *arg) /* * SnapshotResetXmin * - * If there are no more snapshots, we can reset our PGXACT->xmin to InvalidXid. + * If there are no more snapshots, we can reset our PGPROC->xmin to InvalidXid. * Note we can do this without locking because we assume that storing an Xid * is atomic. * * Even if there are some remaining snapshots, we may be able to advance our - * PGXACT->xmin to some degree. This typically happens when a portal is - * dropped. For efficiency, we only consider recomputing PGXACT->xmin when + * PGPROC->xmin to some degree. This typically happens when a portal is + * dropped. For efficiency, we only consider recomputing PGPROC->xmin when * the active snapshot stack is empty; this allows us not to need to track * which active snapshot is oldest. 
* @@ -977,15 +977,15 @@ SnapshotResetXmin(void) if (pairingheap_is_empty(&RegisteredSnapshots)) { - MyPgXact->xmin = InvalidTransactionId; + MyProc->xmin = InvalidTransactionId; return; } minSnapshot = pairingheap_container(SnapshotData, ph_node, pairingheap_first(&RegisteredSnapshots)); - if (TransactionIdPrecedes(MyPgXact->xmin, minSnapshot->xmin)) - MyPgXact->xmin = minSnapshot->xmin; + if (TransactionIdPrecedes(MyProc->xmin, minSnapshot->xmin)) + MyProc->xmin = minSnapshot->xmin; } /* @@ -1132,13 +1132,13 @@ AtEOXact_Snapshot(bool isCommit, bool resetXmin) /* * During normal commit processing, we call ProcArrayEndTransaction() to - * reset the MyPgXact->xmin. That call happens prior to the call to + * reset the MyProc->xmin. That call happens prior to the call to * AtEOXact_Snapshot(), so we need not touch xmin here at all. */ if (resetXmin) SnapshotResetXmin(); - Assert(resetXmin || MyPgXact->xmin == 0); + Assert(resetXmin || MyProc->xmin == 0); } @@ -1830,7 +1830,7 @@ TransactionIdLimitedForOldSnapshots(TransactionId recentXmin, */ if (old_snapshot_threshold == 0) { - if (TransactionIdPrecedes(latest_xmin, MyPgXact->xmin) + if (TransactionIdPrecedes(latest_xmin, MyProc->xmin) && TransactionIdFollows(latest_xmin, xlimit)) xlimit = latest_xmin; diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index 52ff43cabaaf..5e4b028a5f98 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -101,6 +101,11 @@ struct PGPROC Latch procLatch; /* generic latch for process */ + TransactionId xmin; /* minimal running XID as it was when we were + * starting our xact, excluding LAZY VACUUM: + * vacuum must not remove tuples deleted by + * xid >= xmin ! 
*/ + LocalTransactionId lxid; /* local id of top-level transaction currently * being executed by this proc, if running; * else InvalidLocalTransactionId */ @@ -223,11 +228,6 @@ typedef struct PGXACT * executed by this proc, if running and XID * is assigned; else InvalidTransactionId */ - TransactionId xmin; /* minimal running XID as it was when we were - * starting our xact, excluding LAZY VACUUM: - * vacuum must not remove tuples deleted by - * xid >= xmin ! */ - uint8 vacuumFlags; /* vacuum-related flags, see above */ bool overflowed; From a9306f10b95992ec7229cae3de507a9fa2f6aa3c Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 13 Aug 2020 20:00:38 -0400 Subject: [PATCH 291/334] Doc: improve examples for json_populate_record() and related functions. Make these examples self-contained by providing declarations of the user-defined row types they rely on. There wasn't room to do this in the old doc format, but now there is, and I think it makes the examples a good bit less confusing. --- doc/src/sgml/func.sgml | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index f766c1bc67c1..9a4ac5a1ea36 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -15414,7 +15414,12 @@ table2-mapping calls. - select * from json_populate_record(null::myrowtype, '{"a": 1, "b": ["2", "a b"], "c": {"d": 4, "e": "a b c"}}') + create type subrowtype as (d int, e text); + create type myrowtype as (a int, b text[], c subrowtype); + + + select * from json_populate_record(null::myrowtype, + '{"a": 1, "b": ["2", "a b"], "c": {"d": 4, "e": "a b c"}, "x": "foo"}') a | b | c @@ -15446,7 +15451,10 @@ table2-mapping for json[b]_populate_record. 
- select * from json_populate_recordset(null::myrowtype, '[{"a":1,"b":2},{"a":3,"b":4}]') + create type twoints as (a int, b int); + + + select * from json_populate_recordset(null::twoints, '[{"a":1,"b":2},{"a":3,"b":4}]') a | b @@ -15483,7 +15491,10 @@ table2-mapping input record value, unmatched columns are always filled with nulls. - select * from json_to_record('{"a":1,"b":[1,2,3],"c":[1,2,3],"e":"bar","r": {"a": 123, "b": "a b c"}}') as x(a int, b text, c int[], d text, r myrowtype) + create type myrowtype as (a int, b text); + + + select * from json_to_record('{"a":1,"b":[1,2,3],"c":[1,2,3],"e":"bar","r": {"a": 123, "b": "a b c"}}') as x(a int, b text, c int[], d text, r myrowtype) a | b | c | d | r From 1f32136a9960df2e135e7b36363ea1f087b514a0 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Fri, 14 Aug 2020 09:30:34 +0900 Subject: [PATCH 292/334] Fix compilation warnings with libselinux 3.1 in contrib/sepgsql/ Upstream SELinux has recently marked security_context_t as officially deprecated, causing warnings with -Wdeprecated-declarations. This is considered as legacy code for some time now by upstream as security_context_t got removed from most of the code tree during the development of 2.3 back in 2014. This removes all the references to security_context_t in sepgsql/ to be consistent with SELinux, fixing the warnings. Note that this does not impact the minimum version of libselinux supported. 
Reviewed-by: Tom Lane Discussion: https://postgr.es/m/20200813012735.GC11663@paquier.xyz --- contrib/sepgsql/label.c | 10 +++++----- contrib/sepgsql/selinux.c | 10 +++++----- contrib/sepgsql/uavc.c | 4 ++-- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/contrib/sepgsql/label.c b/contrib/sepgsql/label.c index 32e405530bb6..b00b91df5aa3 100644 --- a/contrib/sepgsql/label.c +++ b/contrib/sepgsql/label.c @@ -120,7 +120,7 @@ sepgsql_set_client_label(const char *new_label) tcontext = client_label_peer; else { - if (security_check_context_raw((security_context_t) new_label) < 0) + if (security_check_context_raw(new_label) < 0) ereport(ERROR, (errcode(ERRCODE_INVALID_NAME), errmsg("SELinux: invalid security label: \"%s\"", @@ -453,9 +453,9 @@ sepgsql_get_label(Oid classId, Oid objectId, int32 subId) object.objectSubId = subId; label = GetSecurityLabel(&object, SEPGSQL_LABEL_TAG); - if (!label || security_check_context_raw((security_context_t) label)) + if (!label || security_check_context_raw(label)) { - security_context_t unlabeled; + char *unlabeled; if (security_get_initial_context_raw("unlabeled", &unlabeled) < 0) ereport(ERROR, @@ -487,7 +487,7 @@ sepgsql_object_relabel(const ObjectAddress *object, const char *seclabel) * context of selinux. */ if (seclabel && - security_check_context_raw((security_context_t) seclabel) < 0) + security_check_context_raw(seclabel) < 0) ereport(ERROR, (errcode(ERRCODE_INVALID_NAME), errmsg("SELinux: invalid security label: \"%s\"", seclabel))); @@ -725,7 +725,7 @@ exec_object_restorecon(struct selabel_handle *sehnd, Oid catalogId) char *objname; int objtype = 1234; ObjectAddress object; - security_context_t context; + char *context; /* * The way to determine object name depends on object classes. 
So, any diff --git a/contrib/sepgsql/selinux.c b/contrib/sepgsql/selinux.c index 9fdc810f2ed4..2695e88f23c9 100644 --- a/contrib/sepgsql/selinux.c +++ b/contrib/sepgsql/selinux.c @@ -768,8 +768,8 @@ sepgsql_compute_avd(const char *scontext, * Ask SELinux what is allowed set of permissions on a pair of the * security contexts and the given object class. */ - if (security_compute_av_flags_raw((security_context_t) scontext, - (security_context_t) tcontext, + if (security_compute_av_flags_raw(scontext, + tcontext, tclass_ex, 0, &avd_ex) < 0) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), @@ -838,7 +838,7 @@ sepgsql_compute_create(const char *scontext, uint16 tclass, const char *objname) { - security_context_t ncontext; + char *ncontext; security_class_t tclass_ex; const char *tclass_name; char *result; @@ -853,8 +853,8 @@ sepgsql_compute_create(const char *scontext, * Ask SELinux what is the default context for the given object class on a * pair of security contexts */ - if (security_compute_create_name_raw((security_context_t) scontext, - (security_context_t) tcontext, + if (security_compute_create_name_raw(scontext, + tcontext, tclass_ex, objname, &ncontext) < 0) diff --git a/contrib/sepgsql/uavc.c b/contrib/sepgsql/uavc.c index 639a52c5567b..97189b7c46f0 100644 --- a/contrib/sepgsql/uavc.c +++ b/contrib/sepgsql/uavc.c @@ -171,7 +171,7 @@ sepgsql_avc_unlabeled(void) { if (!avc_unlabeled) { - security_context_t unlabeled; + char *unlabeled; if (security_get_initial_context_raw("unlabeled", &unlabeled) < 0) ereport(ERROR, @@ -216,7 +216,7 @@ sepgsql_avc_compute(const char *scontext, const char *tcontext, uint16 tclass) * policy is reloaded, validation status shall be kept, so we also cache * whether the supplied security context was valid, or not. 
*/ - if (security_check_context_raw((security_context_t) tcontext) != 0) + if (security_check_context_raw(tcontext) != 0) ucontext = sepgsql_avc_unlabeled(); /* From 5bdf694568ef0b9eebef32002a9ebc1918dd0b4b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 14 Aug 2020 10:40:50 +0300 Subject: [PATCH 293/334] Fix typo in test comment. --- src/test/regress/expected/stats_ext.out | 2 +- src/test/regress/sql/stats_ext.sql | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out index 0ae779a3b974..8c667d786a21 100644 --- a/src/test/regress/expected/stats_ext.out +++ b/src/test/regress/expected/stats_ext.out @@ -292,7 +292,7 @@ SELECT s.stxkind, d.stxdndistinct {d,f,m} | {"3, 4": 2550, "3, 6": 800, "4, 6": 1632, "3, 4, 6": 5000} (1 row) --- correct esimates +-- correct estimates SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b'); estimated | actual -----------+-------- diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql index 2834a902a70c..f8d947af9e80 100644 --- a/src/test/regress/sql/stats_ext.sql +++ b/src/test/regress/sql/stats_ext.sql @@ -201,7 +201,7 @@ SELECT s.stxkind, d.stxdndistinct WHERE s.stxrelid = 'ndistinct'::regclass AND d.stxoid = s.oid; --- correct esimates +-- correct estimates SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b'); SELECT * FROM check_estimated_rows('SELECT COUNT(*) FROM ndistinct GROUP BY a, b, c'); From 0038f943878286ce84b2dfac10d64e00eab02edd Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 14 Aug 2020 13:26:57 -0400 Subject: [PATCH 294/334] Fix postmaster's behavior during smart shutdown. Up to now, upon receipt of a SIGTERM ("smart shutdown" command), the postmaster has immediately killed all "optional" background processes, and subsequently refused to launch new ones while it's waiting for foreground client processes to exit. 
No doubt this seemed like an OK policy at some point; but it's a pretty bad one now, because it makes for a seriously degraded environment for the remaining clients: * Parallel queries are killed, and new ones fail to launch. (And our parallel-query infrastructure utterly fails to deal with the case in a reasonable way --- it just hangs waiting for workers that are not going to arrive. There is more work needed in that area IMO.) * Autovacuum ceases to function. We can tolerate that for awhile, but if bulk-update queries continue to run in the surviving client sessions, there's eventually going to be a mess. In the worst case the system could reach a forced shutdown to prevent XID wraparound. * The bgwriter and walwriter are also stopped immediately, likely resulting in performance degradation. Hence, let's rearrange things so that the only immediate change in behavior is refusing to let in new normal connections. Once the last normal connection is gone, shut everything down as though we'd received a "fast" shutdown. To implement this, remove the PM_WAIT_BACKUP and PM_WAIT_READONLY states, instead staying in PM_RUN or PM_HOT_STANDBY while normal connections remain. A subsidiary state variable tracks whether or not we're letting in new connections in those states. This also allows having just one copy of the logic for killing child processes in smart and fast shutdown modes. I moved that logic into PostmasterStateMachine() by inventing a new state PM_STOP_BACKENDS. Back-patch to 9.6 where parallel query was added. In principle this'd be a good idea in 9.5 as well, but the risk/reward ratio is not as good there, since lack of autovacuum is not a problem during typical uses of smart shutdown. Per report from Bharath Rupireddy. 
Patch by me, reviewed by Thomas Munro Discussion: https://postgr.es/m/CALj2ACXAZ5vKxT9P7P89D87i3MDO9bfS+_bjMHgnWJs8uwUOOw@mail.gmail.com --- doc/src/sgml/ref/pg_ctl-ref.sgml | 4 +- src/backend/postmaster/postmaster.c | 239 ++++++++++++++-------------- src/backend/utils/init/postinit.c | 2 +- src/include/libpq/libpq-be.h | 2 +- 4 files changed, 126 insertions(+), 121 deletions(-) diff --git a/doc/src/sgml/ref/pg_ctl-ref.sgml b/doc/src/sgml/ref/pg_ctl-ref.sgml index e31275a04e27..3946fa52eab7 100644 --- a/doc/src/sgml/ref/pg_ctl-ref.sgml +++ b/doc/src/sgml/ref/pg_ctl-ref.sgml @@ -185,8 +185,8 @@ PostgreSQL documentation mode shuts down the server that is running in the specified data directory. Three different shutdown methods can be selected with the - option. Smart mode waits for all active - clients to disconnect and any online backup to finish. + option. Smart mode disallows new connections, then waits + for all existing clients to disconnect and any online backup to finish. If the server is in hot standby, recovery and streaming replication will be terminated once all clients have disconnected. Fast mode (the default) does not wait for clients to disconnect and diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index 38e2c16ac206..42223c0f61e2 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -148,8 +148,6 @@ #define BACKEND_TYPE_BGWORKER 0x0008 /* bgworker process */ #define BACKEND_TYPE_ALL 0x000F /* OR of all the above */ -#define BACKEND_TYPE_WORKER (BACKEND_TYPE_AUTOVAC | BACKEND_TYPE_BGWORKER) - /* * List of active backends (or child processes anyway; we don't actually * know whether a given child has become a backend or is still in the @@ -304,8 +302,7 @@ static bool FatalError = false; /* T if recovering from backend crash */ * and we switch to PM_RUN state. * * Normal child backends can only be launched when we are in PM_RUN or - * PM_HOT_STANDBY state. 
(We also allow launch of normal - * child backends in PM_WAIT_BACKUP state, but only for superusers.) + * PM_HOT_STANDBY state. (connsAllowed can also restrict launching.) * In other states we handle connection requests by launching "dead_end" * child processes, which will simply send the client an error message and * quit. (We track these in the BackendList so that we can know when they @@ -319,10 +316,10 @@ static bool FatalError = false; /* T if recovering from backend crash */ * * Notice that this state variable does not distinguish *why* we entered * states later than PM_RUN --- Shutdown and FatalError must be consulted - * to find that out. FatalError is never true in PM_RECOVERY_* or PM_RUN - * states, nor in PM_SHUTDOWN states (because we don't enter those states - * when trying to recover from a crash). It can be true in PM_STARTUP state, - * because we don't clear it until we've successfully started WAL redo. + * to find that out. FatalError is never true in PM_RECOVERY, PM_HOT_STANDBY, + * or PM_RUN states, nor in PM_SHUTDOWN states (because we don't enter those + * states when trying to recover from a crash). It can be true in PM_STARTUP + * state, because we don't clear it until we've successfully started WAL redo. */ typedef enum { @@ -331,8 +328,7 @@ typedef enum PM_RECOVERY, /* in archive recovery mode */ PM_HOT_STANDBY, /* in hot standby mode */ PM_RUN, /* normal "database is alive" state */ - PM_WAIT_BACKUP, /* waiting for online backup mode to end */ - PM_WAIT_READONLY, /* waiting for read only backends to exit */ + PM_STOP_BACKENDS, /* need to stop remaining backends */ PM_WAIT_BACKENDS, /* waiting for live backends to exit */ PM_SHUTDOWN, /* waiting for checkpointer to do shutdown * ckpt */ @@ -344,6 +340,21 @@ typedef enum static PMState pmState = PM_INIT; +/* + * While performing a "smart shutdown", we restrict new connections but stay + * in PM_RUN or PM_HOT_STANDBY state until all the client backends are gone. 
+ * connsAllowed is a sub-state indicator showing the active restriction. + * It is of no interest unless pmState is PM_RUN or PM_HOT_STANDBY. + */ +typedef enum +{ + ALLOW_ALL_CONNS, /* normal not-shutting-down state */ + ALLOW_SUPERUSER_CONNS, /* only superusers can connect */ + ALLOW_NO_CONNS /* no new connections allowed, period */ +} ConnsAllowedState; + +static ConnsAllowedState connsAllowed = ALLOW_ALL_CONNS; + /* Start time of SIGKILL timeout during immediate shutdown or child crash */ /* Zero means timeout is not running */ static time_t AbortStartTime = 0; @@ -2323,7 +2334,7 @@ ProcessStartupPacket(Port *port, bool ssl_done, bool gss_done) (errcode(ERRCODE_TOO_MANY_CONNECTIONS), errmsg("sorry, too many clients already"))); break; - case CAC_WAITBACKUP: + case CAC_SUPERUSER: /* OK for now, will check in InitPostgres */ break; case CAC_OK: @@ -2443,31 +2454,36 @@ canAcceptConnections(int backend_type) * state. We treat autovac workers the same as user backends for this * purpose. However, bgworkers are excluded from this test; we expect * bgworker_should_start_now() decided whether the DB state allows them. - * - * In state PM_WAIT_BACKUP only superusers can connect (this must be - * allowed so that a superuser can end online backup mode); we return - * CAC_WAITBACKUP code to indicate that this must be checked later. Note - * that neither CAC_OK nor CAC_WAITBACKUP can safely be returned until we - * have checked for too many children. 
*/ - if (pmState != PM_RUN && + if (pmState != PM_RUN && pmState != PM_HOT_STANDBY && backend_type != BACKEND_TYPE_BGWORKER) { - if (pmState == PM_WAIT_BACKUP) - result = CAC_WAITBACKUP; /* allow superusers only */ - else if (Shutdown > NoShutdown) + if (Shutdown > NoShutdown) return CAC_SHUTDOWN; /* shutdown is pending */ else if (!FatalError && (pmState == PM_STARTUP || pmState == PM_RECOVERY)) return CAC_STARTUP; /* normal startup */ - else if (!FatalError && - pmState == PM_HOT_STANDBY) - result = CAC_OK; /* connection OK during hot standby */ else return CAC_RECOVERY; /* else must be crash recovery */ } + /* + * "Smart shutdown" restrictions are applied only to normal connections, + * not to autovac workers or bgworkers. When only superusers can connect, + * we return CAC_SUPERUSER to indicate that superuserness must be checked + * later. Note that neither CAC_OK nor CAC_SUPERUSER can safely be + * returned until we have checked for too many children. + */ + if (connsAllowed != ALLOW_ALL_CONNS && + backend_type == BACKEND_TYPE_NORMAL) + { + if (connsAllowed == ALLOW_SUPERUSER_CONNS) + result = CAC_SUPERUSER; /* allow superusers only */ + else + return CAC_SHUTDOWN; /* shutdown is pending */ + } + /* * Don't start too many children. * @@ -2793,34 +2809,22 @@ pmdie(SIGNAL_ARGS) sd_notify(0, "STOPPING=1"); #endif - if (pmState == PM_RUN || pmState == PM_RECOVERY || - pmState == PM_HOT_STANDBY || pmState == PM_STARTUP) + /* + * If we reached normal running, we have to wait for any online + * backup mode to end; otherwise go straight to waiting for client + * backends to exit. (The difference is that in the former state, + * we'll still let in new superuser clients, so that somebody can + * end the online backup mode.) If already in PM_STOP_BACKENDS or + * a later state, do not change it. 
+ */ + if (pmState == PM_RUN) + connsAllowed = ALLOW_SUPERUSER_CONNS; + else if (pmState == PM_HOT_STANDBY) + connsAllowed = ALLOW_NO_CONNS; + else if (pmState == PM_STARTUP || pmState == PM_RECOVERY) { - /* autovac workers are told to shut down immediately */ - /* and bgworkers too; does this need tweaking? */ - SignalSomeChildren(SIGTERM, - BACKEND_TYPE_AUTOVAC | BACKEND_TYPE_BGWORKER); - /* and the autovac launcher too */ - if (AutoVacPID != 0) - signal_child(AutoVacPID, SIGTERM); - /* and the bgwriter too */ - if (BgWriterPID != 0) - signal_child(BgWriterPID, SIGTERM); - /* and the walwriter too */ - if (WalWriterPID != 0) - signal_child(WalWriterPID, SIGTERM); - - /* - * If we're in recovery, we can't kill the startup process - * right away, because at present doing so does not release - * its locks. We might want to change this in a future - * release. For the time being, the PM_WAIT_READONLY state - * indicates that we're waiting for the regular (read only) - * backends to die off; once they do, we'll kill the startup - * and walreceiver processes. - */ - pmState = (pmState == PM_RUN) ? - PM_WAIT_BACKUP : PM_WAIT_READONLY; + /* There should be no clients, so proceed to stop children */ + pmState = PM_STOP_BACKENDS; } /* @@ -2851,48 +2855,23 @@ pmdie(SIGNAL_ARGS) sd_notify(0, "STOPPING=1"); #endif - if (StartupPID != 0) - signal_child(StartupPID, SIGTERM); - if (BgWriterPID != 0) - signal_child(BgWriterPID, SIGTERM); - if (WalReceiverPID != 0) - signal_child(WalReceiverPID, SIGTERM); if (pmState == PM_STARTUP || pmState == PM_RECOVERY) { - SignalSomeChildren(SIGTERM, BACKEND_TYPE_BGWORKER); - - /* - * Only startup, bgwriter, walreceiver, possibly bgworkers, - * and/or checkpointer should be active in this state; we just - * signaled the first four, and we don't want to kill - * checkpointer yet. 
- */ - pmState = PM_WAIT_BACKENDS; + /* Just shut down background processes silently */ + pmState = PM_STOP_BACKENDS; } else if (pmState == PM_RUN || - pmState == PM_WAIT_BACKUP || - pmState == PM_WAIT_READONLY || - pmState == PM_WAIT_BACKENDS || pmState == PM_HOT_STANDBY) { + /* Report that we're about to zap live client sessions */ ereport(LOG, (errmsg("aborting any active transactions"))); - /* shut down all backends and workers */ - SignalSomeChildren(SIGTERM, - BACKEND_TYPE_NORMAL | BACKEND_TYPE_AUTOVAC | - BACKEND_TYPE_BGWORKER); - /* and the autovac launcher too */ - if (AutoVacPID != 0) - signal_child(AutoVacPID, SIGTERM); - /* and the walwriter too */ - if (WalWriterPID != 0) - signal_child(WalWriterPID, SIGTERM); - pmState = PM_WAIT_BACKENDS; + pmState = PM_STOP_BACKENDS; } /* - * Now wait for backends to exit. If there are none, - * PostmasterStateMachine will take the next step. + * PostmasterStateMachine will issue any necessary signals, or + * take the next step if no child processes need to be killed. 
*/ PostmasterStateMachine(); break; @@ -2987,7 +2966,7 @@ reaper(SIGNAL_ARGS) ereport(LOG, (errmsg("shutdown at recovery target"))); StartupStatus = STARTUP_NOT_RUNNING; - Shutdown = SmartShutdown; + Shutdown = Max(Shutdown, SmartShutdown); TerminateChildren(SIGTERM); pmState = PM_WAIT_BACKENDS; /* PostmasterStateMachine logic does the rest */ @@ -3051,6 +3030,7 @@ reaper(SIGNAL_ARGS) AbortStartTime = 0; ReachedNormalRunning = true; pmState = PM_RUN; + connsAllowed = ALLOW_ALL_CONNS; /* * Crank up the background tasks, if we didn't do that already @@ -3712,8 +3692,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname) if (pmState == PM_RECOVERY || pmState == PM_HOT_STANDBY || pmState == PM_RUN || - pmState == PM_WAIT_BACKUP || - pmState == PM_WAIT_READONLY || + pmState == PM_STOP_BACKENDS || pmState == PM_SHUTDOWN) pmState = PM_WAIT_BACKENDS; @@ -3796,35 +3775,60 @@ LogChildExit(int lev, const char *procname, int pid, int exitstatus) static void PostmasterStateMachine(void) { - if (pmState == PM_WAIT_BACKUP) + /* If we're doing a smart shutdown, try to advance that state. */ + if (pmState == PM_RUN || pmState == PM_HOT_STANDBY) { - /* - * PM_WAIT_BACKUP state ends when online backup mode is not active. - */ - if (!BackupInProgress()) - pmState = PM_WAIT_BACKENDS; - } + if (connsAllowed == ALLOW_SUPERUSER_CONNS) + { + /* + * ALLOW_SUPERUSER_CONNS state ends as soon as online backup mode + * is not active. + */ + if (!BackupInProgress()) + connsAllowed = ALLOW_NO_CONNS; + } - if (pmState == PM_WAIT_READONLY) - { - /* - * PM_WAIT_READONLY state ends when we have no regular backends that - * have been started during recovery. We kill the startup and - * walreceiver processes and transition to PM_WAIT_BACKENDS. Ideally, - * we might like to kill these processes first and then wait for - * backends to die off, but that doesn't work at present because - * killing the startup process doesn't release its locks. 
- */ - if (CountChildren(BACKEND_TYPE_NORMAL) == 0) + if (connsAllowed == ALLOW_NO_CONNS) { - if (StartupPID != 0) - signal_child(StartupPID, SIGTERM); - if (WalReceiverPID != 0) - signal_child(WalReceiverPID, SIGTERM); - pmState = PM_WAIT_BACKENDS; + /* + * ALLOW_NO_CONNS state ends when we have no normal client + * backends running. Then we're ready to stop other children. + */ + if (CountChildren(BACKEND_TYPE_NORMAL) == 0) + pmState = PM_STOP_BACKENDS; } } + /* + * If we're ready to do so, signal child processes to shut down. (This + * isn't a persistent state, but treating it as a distinct pmState allows + * us to share this code across multiple shutdown code paths.) + */ + if (pmState == PM_STOP_BACKENDS) + { + /* Signal all backend children except walsenders */ + SignalSomeChildren(SIGTERM, + BACKEND_TYPE_ALL - BACKEND_TYPE_WALSND); + /* and the autovac launcher too */ + if (AutoVacPID != 0) + signal_child(AutoVacPID, SIGTERM); + /* and the bgwriter too */ + if (BgWriterPID != 0) + signal_child(BgWriterPID, SIGTERM); + /* and the walwriter too */ + if (WalWriterPID != 0) + signal_child(WalWriterPID, SIGTERM); + /* If we're in recovery, also stop startup and walreceiver procs */ + if (StartupPID != 0) + signal_child(StartupPID, SIGTERM); + if (WalReceiverPID != 0) + signal_child(WalReceiverPID, SIGTERM); + /* checkpointer, archiver, stats, and syslogger may continue for now */ + + /* Now transition to PM_WAIT_BACKENDS state to wait for them to die */ + pmState = PM_WAIT_BACKENDS; + } + /* * If we are in a state-machine state that implies waiting for backends to * exit, see if they're all gone, and change state if so. @@ -3843,7 +3847,7 @@ PostmasterStateMachine(void) * later after writing the checkpoint record, like the archiver * process. 
*/ - if (CountChildren(BACKEND_TYPE_NORMAL | BACKEND_TYPE_WORKER) == 0 && + if (CountChildren(BACKEND_TYPE_ALL - BACKEND_TYPE_WALSND) == 0 && StartupPID == 0 && WalReceiverPID == 0 && BgWriterPID == 0 && @@ -4184,7 +4188,7 @@ BackendStartup(Port *port) /* Pass down canAcceptConnections state */ port->canAcceptConnections = canAcceptConnections(BACKEND_TYPE_NORMAL); bn->dead_end = (port->canAcceptConnections != CAC_OK && - port->canAcceptConnections != CAC_WAITBACKUP); + port->canAcceptConnections != CAC_SUPERUSER); /* * Unless it's a dead_end child, assign it a child slot number @@ -5255,6 +5259,8 @@ sigusr1_handler(SIGNAL_ARGS) #endif pmState = PM_HOT_STANDBY; + connsAllowed = ALLOW_ALL_CONNS; + /* Some workers may be scheduled to start now */ StartWorkerNeeded = true; } @@ -5287,7 +5293,7 @@ sigusr1_handler(SIGNAL_ARGS) } if (CheckPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER) && - Shutdown == NoShutdown) + Shutdown <= SmartShutdown && pmState < PM_STOP_BACKENDS) { /* * Start one iteration of the autovacuum daemon, even if autovacuuming @@ -5302,7 +5308,7 @@ sigusr1_handler(SIGNAL_ARGS) } if (CheckPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER) && - Shutdown == NoShutdown) + Shutdown <= SmartShutdown && pmState < PM_STOP_BACKENDS) { /* The autovacuum launcher wants us to start a worker process. 
*/ StartAutovacuumWorker(); @@ -5333,7 +5339,7 @@ sigusr1_handler(SIGNAL_ARGS) if (StartupPID != 0 && (pmState == PM_STARTUP || pmState == PM_RECOVERY || - pmState == PM_HOT_STANDBY || pmState == PM_WAIT_READONLY) && + pmState == PM_HOT_STANDBY) && CheckPromoteSignal()) { /* @@ -5651,8 +5657,8 @@ MaybeStartWalReceiver(void) { if (WalReceiverPID == 0 && (pmState == PM_STARTUP || pmState == PM_RECOVERY || - pmState == PM_HOT_STANDBY || pmState == PM_WAIT_READONLY) && - Shutdown == NoShutdown) + pmState == PM_HOT_STANDBY) && + Shutdown <= SmartShutdown) { WalReceiverPID = StartWalReceiver(); if (WalReceiverPID != 0) @@ -5905,8 +5911,7 @@ bgworker_should_start_now(BgWorkerStartTime start_time) case PM_SHUTDOWN_2: case PM_SHUTDOWN: case PM_WAIT_BACKENDS: - case PM_WAIT_READONLY: - case PM_WAIT_BACKUP: + case PM_STOP_BACKENDS: break; case PM_RUN: diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 893be2f3ddbf..d4ab4c7e2333 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -795,7 +795,7 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username, */ if ((!am_superuser || am_walsender) && MyProcPort != NULL && - MyProcPort->canAcceptConnections == CAC_WAITBACKUP) + MyProcPort->canAcceptConnections == CAC_SUPERUSER) { if (am_walsender) ereport(FATAL, diff --git a/src/include/libpq/libpq-be.h b/src/include/libpq/libpq-be.h index 179ebaa104b3..0a23281ad59b 100644 --- a/src/include/libpq/libpq-be.h +++ b/src/include/libpq/libpq-be.h @@ -71,7 +71,7 @@ typedef struct typedef enum CAC_state { CAC_OK, CAC_STARTUP, CAC_SHUTDOWN, CAC_RECOVERY, CAC_TOOMANY, - CAC_WAITBACKUP + CAC_SUPERUSER } CAC_state; From 914140e85a79c63853c86334afa2d7e6e930c11a Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Fri, 14 Aug 2020 11:09:08 -0700 Subject: [PATCH 295/334] Fix obsolete comment in xlogutils.c. Oversight in commit 2c03216d831. 
--- src/backend/access/transam/xlogutils.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index b2ca0cd4cf39..7e915bcadf10 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -260,10 +260,9 @@ XLogCheckInvalidPages(void) * determines what needs to be done to redo the changes to it. If the WAL * record includes a full-page image of the page, it is restored. * - * 'lsn' is the LSN of the record being replayed. It is compared with the - * page's LSN to determine if the record has already been replayed. - * 'block_id' is the ID number the block was registered with, when the WAL - * record was created. + * 'record.EndRecPtr' is compared to the page's LSN to determine if the record + * has already been replayed. 'block_id' is the ID number the block was + * registered with, when the WAL record was created. * * Returns one of the following: * From 2ba5b2db7943742e100834d99548c5d2661a105b Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Fri, 14 Aug 2020 17:33:31 -0400 Subject: [PATCH 296/334] pg_dump: fix dependencies on FKs to partitioned tables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Parallel-restoring a foreign key that references a partitioned table with several levels of partitions can fail: pg_restore: while PROCESSING TOC: pg_restore: from TOC entry 6684; 2606 29166 FK CONSTRAINT fk fk_a_fkey postgres pg_restore: error: could not execute query: ERROR: there is no unique constraint matching given keys for referenced table "pk" Command was: ALTER TABLE fkpart3.fk ADD CONSTRAINT fk_a_fkey FOREIGN KEY (a) REFERENCES fkpart3.pk(a); This happens in parallel restore mode because some index partitions aren't yet attached to the topmost partitioned index that the FK uses, and so the index is still invalid. 
The current code marks the FK as dependent on the first level of index-attach dump objects; the bug is fixed by recursively marking the FK on their children. Backpatch to 12, where FKs to partitioned tables were introduced. Reported-by: Tom Lane Author: Álvaro Herrera Discussion: https://postgr.es/m/3170626.1594842723@sss.pgh.pa.us Backpatch: 12-master --- src/bin/pg_dump/pg_dump.c | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 9c8436dde6cc..2cb3f9b083ec 100644 --- a/src/bin/pg_dump/pg_dump.c +++ b/src/bin/pg_dump/pg_dump.c @@ -235,6 +235,7 @@ static DumpableObject *createBoundaryObjects(void); static void addBoundaryDependencies(DumpableObject **dobjs, int numObjs, DumpableObject *boundaryObjs); +static void addConstrChildIdxDeps(DumpableObject *dobj, IndxInfo *refidx); static void getDomainConstraints(Archive *fout, TypeInfo *tyinfo); static void getTableData(DumpOptions *dopt, TableInfo *tblinfo, int numTables, char relkind); static void makeTableDataInfo(DumpOptions *dopt, TableInfo *tbinfo); @@ -7517,25 +7518,20 @@ getConstraints(Archive *fout, TableInfo tblinfo[], int numTables) reftable = findTableByOid(constrinfo[j].confrelid); if (reftable && reftable->relkind == RELKIND_PARTITIONED_TABLE) { - IndxInfo *refidx; Oid indexOid = atooid(PQgetvalue(res, j, i_conindid)); if (indexOid != InvalidOid) { for (int k = 0; k < reftable->numIndexes; k++) { - SimplePtrListCell *cell; + IndxInfo *refidx; /* not our index? 
*/ if (reftable->indexes[k].dobj.catId.oid != indexOid) continue; refidx = &reftable->indexes[k]; - for (cell = refidx->partattaches.head; cell; - cell = cell->next) - addObjectDependency(&constrinfo[j].dobj, - ((DumpableObject *) - cell->ptr)->dumpId); + addConstrChildIdxDeps(&constrinfo[j].dobj, refidx); break; } } @@ -7548,6 +7544,35 @@ getConstraints(Archive *fout, TableInfo tblinfo[], int numTables) destroyPQExpBuffer(query); } +/* + * addConstrChildIdxDeps + * + * Recursive subroutine for getConstraints + * + * Given an object representing a foreign key constraint and an index on the + * partitioned table it references, mark the constraint object as dependent + * on the DO_INDEX_ATTACH object of each index partition, recursively + * drilling down to their partitions if any. This ensures that the FK is not + * restored until the index is fully marked valid. + */ +static void +addConstrChildIdxDeps(DumpableObject *dobj, IndxInfo *refidx) +{ + SimplePtrListCell *cell; + + Assert(dobj->objType == DO_FK_CONSTRAINT); + + for (cell = refidx->partattaches.head; cell; cell = cell->next) + { + IndexAttachInfo *attach = (IndexAttachInfo *) cell->ptr; + + addObjectDependency(dobj, attach->dobj.dumpId); + + if (attach->partitionIdx->partattaches.head != NULL) + addConstrChildIdxDeps(dobj, attach->partitionIdx); + } +} + /* * getDomainConstraints * From 941697c3c1ae5d6ee153065adb96e1e63ee11224 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Fri, 14 Aug 2020 12:15:38 -0700 Subject: [PATCH 297/334] snapshot scalability: Introduce dense array of in-progress xids. The new array contains the xids for all connected backends / in-use PGPROC entries in a dense manner (in contrast to the PGPROC/PGXACT arrays which can have unused entries interspersed). This improves performance because GetSnapshotData() always needs to scan the xids of all live procarray entries and now there's no need to go through the procArray->pgprocnos indirection anymore. 
As the set of running top-level xids changes rarely, compared to the
number of snapshots taken, this substantially increases the likelihood
of most data required for a snapshot being in l2 cache.  In
read-mostly workloads scanning the xids[] array will be sufficient to
build a snapshot, as most backends will not have an xid assigned.

To keep the xid array dense ProcArrayRemove() needs to move entries
behind the to-be-removed proc's one further up in the array. Obviously
moving array entries cannot happen while a backend sets its
xid. I.e. locking needs to prevent that array entries are moved while
a backend modifies its xid.

To avoid locking ProcArrayLock in GetNewTransactionId() - a fairly hot
spot already - ProcArrayAdd() / ProcArrayRemove() now needs to hold
XidGenLock in addition to ProcArrayLock. Adding / Removing a procarray
entry is not a very frequent operation, even taking 2PC into account.

Due to the above, the dense array entries can only be read or modified
while holding ProcArrayLock and/or XidGenLock. This prevents a
concurrent ProcArrayRemove() from shifting the dense array while it is
accessed concurrently.

While the new dense array is very good when needing to look at all
xids it is less suitable when accessing a single backend's xid. In
particular it would be problematic to have to acquire a lock to access
a backend's own xid. Therefore a backend's xid is not just stored in
the dense array, but also in PGPROC. This also allows a backend to
only access the shared xid value when the backend had acquired an xid.

The infrastructure added in this commit will be used for the remaining
PGXACT fields in subsequent commits. They are kept separate to make
review easier. 
Author: Andres Freund Reviewed-By: Robert Haas Reviewed-By: Thomas Munro Reviewed-By: David Rowley Discussion: https://postgr.es/m/20200301083601.ews6hz5dduc3w2se@alap3.anarazel.de --- src/backend/access/heap/heapam_visibility.c | 8 +- src/backend/access/transam/README | 29 ++- src/backend/access/transam/clog.c | 8 +- src/backend/access/transam/twophase.c | 31 +-- src/backend/access/transam/varsup.c | 20 +- src/backend/commands/vacuum.c | 2 +- src/backend/storage/ipc/procarray.c | 271 +++++++++++++------- src/backend/storage/ipc/sinvaladt.c | 4 +- src/backend/storage/lmgr/lock.c | 3 +- src/backend/storage/lmgr/proc.c | 26 +- src/include/storage/proc.h | 79 +++++- 11 files changed, 327 insertions(+), 154 deletions(-) diff --git a/src/backend/access/heap/heapam_visibility.c b/src/backend/access/heap/heapam_visibility.c index 528e75bafd45..80bd4940769c 100644 --- a/src/backend/access/heap/heapam_visibility.c +++ b/src/backend/access/heap/heapam_visibility.c @@ -11,12 +11,12 @@ * shared buffer content lock on the buffer containing the tuple. * * NOTE: When using a non-MVCC snapshot, we must check - * TransactionIdIsInProgress (which looks in the PGXACT array) + * TransactionIdIsInProgress (which looks in the PGPROC array) * before TransactionIdDidCommit/TransactionIdDidAbort (which look in * pg_xact). Otherwise we have a race condition: we might decide that a * just-committed transaction crashed, because none of the tests succeed. * xact.c is careful to record commit/abort in pg_xact before it unsets - * MyPgXact->xid in the PGXACT array. That fixes that problem, but it + * MyProc->xid in the PGPROC array. That fixes that problem, but it * also means there is a window where TransactionIdIsInProgress and * TransactionIdDidCommit will both return true. 
If we check only * TransactionIdDidCommit, we could consider a tuple committed when a @@ -956,7 +956,7 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, * coding where we tried to set the hint bits as soon as possible, we instead * did TransactionIdIsInProgress in each call --- to no avail, as long as the * inserting/deleting transaction was still running --- which was more cycles - * and more contention on the PGXACT array. + * and more contention on ProcArrayLock. */ static bool HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, @@ -1459,7 +1459,7 @@ HeapTupleSatisfiesNonVacuumable(HeapTuple htup, Snapshot snapshot, * HeapTupleSatisfiesMVCC) and, therefore, any hint bits that can be set * should already be set. We assume that if no hint bits are set, the xmin * or xmax transaction is still running. This is therefore faster than - * HeapTupleSatisfiesVacuum, because we don't consult PGXACT nor CLOG. + * HeapTupleSatisfiesVacuum, because we consult neither procarray nor CLOG. * It's okay to return false when in doubt, but we must return true only * if the tuple is removable. */ diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README index eab8edd20ec2..c5f09667ba15 100644 --- a/src/backend/access/transam/README +++ b/src/backend/access/transam/README @@ -251,10 +251,10 @@ enforce, and it assists with some other issues as explained below.) The implementation of this is that GetSnapshotData takes the ProcArrayLock in shared mode (so that multiple backends can take snapshots in parallel), but ProcArrayEndTransaction must take the ProcArrayLock in exclusive mode -while clearing MyPgXact->xid at transaction end (either commit or abort). -(To reduce context switching, when multiple transactions commit nearly -simultaneously, we have one backend take ProcArrayLock and clear the XIDs -of multiple processes at once.) +while clearing the ProcGlobal->xids[] entry at transaction end (either +commit or abort). 
(To reduce context switching, when multiple transactions +commit nearly simultaneously, we have one backend take ProcArrayLock and +clear the XIDs of multiple processes at once.) ProcArrayEndTransaction also holds the lock while advancing the shared latestCompletedXid variable. This allows GetSnapshotData to use @@ -278,12 +278,12 @@ present in the ProcArray, or not running anymore. (This guarantee doesn't apply to subtransaction XIDs, because of the possibility that there's not room for them in the subxid array; instead we guarantee that they are present or the overflow flag is set.) If a backend released XidGenLock -before storing its XID into MyPgXact, then it would be possible for another -backend to allocate and commit a later XID, causing latestCompletedXid to -pass the first backend's XID, before that value became visible in the +before storing its XID into ProcGlobal->xids[], then it would be possible for +another backend to allocate and commit a later XID, causing latestCompletedXid +to pass the first backend's XID, before that value became visible in the ProcArray. That would break ComputeXidHorizons, as discussed below. -We allow GetNewTransactionId to store the XID into MyPgXact->xid (or the +We allow GetNewTransactionId to store the XID into ProcGlobal->xids[] (or the subxid array) without taking ProcArrayLock. This was once necessary to avoid deadlock; while that is no longer the case, it's still beneficial for performance. We are thereby relying on fetch/store of an XID to be atomic, @@ -382,12 +382,13 @@ Top-level transactions do not have a parent, so they leave their pg_subtrans entries set to the default value of zero (InvalidTransactionId). pg_subtrans is used to check whether the transaction in question is still -running --- the main Xid of a transaction is recorded in the PGXACT struct, -but since we allow arbitrary nesting of subtransactions, we can't fit all Xids -in shared memory, so we have to store them on disk. 
Note, however, that for -each transaction we keep a "cache" of Xids that are known to be part of the -transaction tree, so we can skip looking at pg_subtrans unless we know the -cache has been overflowed. See storage/ipc/procarray.c for the gory details. +running --- the main Xid of a transaction is recorded in ProcGlobal->xids[], +with a copy in PGPROC->xid, but since we allow arbitrary nesting of +subtransactions, we can't fit all Xids in shared memory, so we have to store +them on disk. Note, however, that for each transaction we keep a "cache" of +Xids that are known to be part of the transaction tree, so we can skip looking +at pg_subtrans unless we know the cache has been overflowed. See +storage/ipc/procarray.c for the gory details. slru.c is the supporting mechanism for both pg_xact and pg_subtrans. It implements the LRU policy for in-memory buffer pages. The high-level routines diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index dd2f4d5bc7e7..a4599e966106 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -285,15 +285,15 @@ TransactionIdSetPageStatus(TransactionId xid, int nsubxids, * updates for multiple backends so that the number of times XactSLRULock * needs to be acquired is reduced. * - * For this optimization to be safe, the XID in MyPgXact and the subxids - * in MyProc must be the same as the ones for which we're setting the - * status. Check that this is the case. + * For this optimization to be safe, the XID and subxids in MyProc must be + * the same as the ones for which we're setting the status. Check that + * this is the case. * * For this optimization to be efficient, we shouldn't have too many * sub-XIDs and all of the XIDs for which we're adjusting clog should be * on the same page. Check those conditions, too. 
*/ - if (all_xact_same_page && xid == MyPgXact->xid && + if (all_xact_same_page && xid == MyProc->xid && nsubxids <= THRESHOLD_SUBTRANS_CLOG_OPT && nsubxids == MyPgXact->nxids && memcmp(subxids, MyProc->subxids.xids, diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index eb5f4680a3d9..a0398bf3a3e8 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -351,7 +351,7 @@ AtAbort_Twophase(void) /* * This is called after we have finished transferring state to the prepared - * PGXACT entry. + * PGPROC entry. */ void PostPrepare_Twophase(void) @@ -463,7 +463,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid, proc->waitStatus = PROC_WAIT_STATUS_OK; /* We set up the gxact's VXID as InvalidBackendId/XID */ proc->lxid = (LocalTransactionId) xid; - pgxact->xid = xid; + proc->xid = xid; Assert(proc->xmin == InvalidTransactionId); proc->delayChkpt = false; pgxact->vacuumFlags = 0; @@ -768,7 +768,6 @@ pg_prepared_xact(PG_FUNCTION_ARGS) { GlobalTransaction gxact = &status->array[status->currIdx++]; PGPROC *proc = &ProcGlobal->allProcs[gxact->pgprocno]; - PGXACT *pgxact = &ProcGlobal->allPgXact[gxact->pgprocno]; Datum values[5]; bool nulls[5]; HeapTuple tuple; @@ -783,7 +782,7 @@ pg_prepared_xact(PG_FUNCTION_ARGS) MemSet(values, 0, sizeof(values)); MemSet(nulls, 0, sizeof(nulls)); - values[0] = TransactionIdGetDatum(pgxact->xid); + values[0] = TransactionIdGetDatum(proc->xid); values[1] = CStringGetTextDatum(gxact->gid); values[2] = TimestampTzGetDatum(gxact->prepared_at); values[3] = ObjectIdGetDatum(gxact->owner); @@ -829,9 +828,8 @@ TwoPhaseGetGXact(TransactionId xid, bool lock_held) for (i = 0; i < TwoPhaseState->numPrepXacts; i++) { GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; - PGXACT *pgxact = &ProcGlobal->allPgXact[gxact->pgprocno]; - if (pgxact->xid == xid) + if (gxact->xid == xid) { result = gxact; break; @@ -987,8 +985,7 @@ void 
StartPrepare(GlobalTransaction gxact) { PGPROC *proc = &ProcGlobal->allProcs[gxact->pgprocno]; - PGXACT *pgxact = &ProcGlobal->allPgXact[gxact->pgprocno]; - TransactionId xid = pgxact->xid; + TransactionId xid = gxact->xid; TwoPhaseFileHeader hdr; TransactionId *children; RelFileNode *commitrels; @@ -1140,15 +1137,15 @@ EndPrepare(GlobalTransaction gxact) /* * Mark the prepared transaction as valid. As soon as xact.c marks - * MyPgXact as not running our XID (which it will do immediately after + * MyProc as not running our XID (which it will do immediately after * this function returns), others can commit/rollback the xact. * * NB: a side effect of this is to make a dummy ProcArray entry for the - * prepared XID. This must happen before we clear the XID from MyPgXact, - * else there is a window where the XID is not running according to - * TransactionIdIsInProgress, and onlookers would be entitled to assume - * the xact crashed. Instead we have a window where the same XID appears - * twice in ProcArray, which is OK. + * prepared XID. This must happen before we clear the XID from MyProc / + * ProcGlobal->xids[], else there is a window where the XID is not running + * according to TransactionIdIsInProgress, and onlookers would be entitled + * to assume the xact crashed. Instead we have a window where the same + * XID appears twice in ProcArray, which is OK. */ MarkAsPrepared(gxact, false); @@ -1404,7 +1401,6 @@ FinishPreparedTransaction(const char *gid, bool isCommit) { GlobalTransaction gxact; PGPROC *proc; - PGXACT *pgxact; TransactionId xid; char *buf; char *bufptr; @@ -1423,8 +1419,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit) */ gxact = LockGXact(gid, GetUserId()); proc = &ProcGlobal->allProcs[gxact->pgprocno]; - pgxact = &ProcGlobal->allPgXact[gxact->pgprocno]; - xid = pgxact->xid; + xid = gxact->xid; /* * Read and validate 2PC state data. 
State data will typically be stored @@ -1726,7 +1721,7 @@ CheckPointTwoPhase(XLogRecPtr redo_horizon) for (i = 0; i < TwoPhaseState->numPrepXacts; i++) { /* - * Note that we are using gxact not pgxact so this works in recovery + * Note that we are using gxact not PGPROC so this works in recovery * also */ GlobalTransaction gxact = TwoPhaseState->prepXacts[i]; diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 2ef0f4991caf..4c91b343ecd2 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -38,7 +38,8 @@ VariableCache ShmemVariableCache = NULL; * Allocate the next FullTransactionId for a new transaction or * subtransaction. * - * The new XID is also stored into MyPgXact before returning. + * The new XID is also stored into MyProc->xid/ProcGlobal->xids[] before + * returning. * * Note: when this is called, we are actually already inside a valid * transaction, since XIDs are now not allocated until the transaction @@ -65,7 +66,8 @@ GetNewTransactionId(bool isSubXact) if (IsBootstrapProcessingMode()) { Assert(!isSubXact); - MyPgXact->xid = BootstrapTransactionId; + MyProc->xid = BootstrapTransactionId; + ProcGlobal->xids[MyProc->pgxactoff] = BootstrapTransactionId; return FullTransactionIdFromEpochAndXid(0, BootstrapTransactionId); } @@ -190,10 +192,10 @@ GetNewTransactionId(bool isSubXact) * latestCompletedXid is present in the ProcArray, which is essential for * correct OldestXmin tracking; see src/backend/access/transam/README. * - * Note that readers of PGXACT xid fields should be careful to fetch the - * value only once, rather than assume they can read a value multiple - * times and get the same answer each time. Note we are assuming that - * TransactionId and int fetch/store are atomic. 
+ * Note that readers of ProcGlobal->xids/PGPROC->xid should be careful + * to fetch the value for each proc only once, rather than assume they can + * read a value multiple times and get the same answer each time. Note we + * are assuming that TransactionId and int fetch/store are atomic. * * The same comments apply to the subxact xid count and overflow fields. * @@ -219,7 +221,11 @@ GetNewTransactionId(bool isSubXact) * answer later on when someone does have a reason to inquire.) */ if (!isSubXact) - MyPgXact->xid = xid; /* LWLockRelease acts as barrier */ + { + /* LWLockRelease acts as barrier */ + MyProc->xid = xid; + ProcGlobal->xids[MyProc->pgxactoff] = xid; + } else { int nxids = MyPgXact->nxids; diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 22228f5684f0..648e12c78d84 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -1724,7 +1724,7 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params) * * Note: these flags remain set until CommitTransaction or * AbortTransaction. We don't want to clear them until we reset - * MyPgXact->xid/xmin, otherwise GetOldestNonRemovableTransactionId() + * MyProc->xid/xmin, otherwise GetOldestNonRemovableTransactionId() * might appear to go backwards, which is probably Not Good. */ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 185f581c8b6f..0bf20a49375d 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -9,8 +9,9 @@ * one is as a means of determining the set of currently running transactions. * * Because of various subtle race conditions it is critical that a backend - * hold the correct locks while setting or clearing its MyPgXact->xid field. - * See notes in src/backend/access/transam/README. + * hold the correct locks while setting or clearing its xid (in + * ProcGlobal->xids[]/MyProc->xid). 
See notes in + * src/backend/access/transam/README. * * The process arrays now also include structures representing prepared * transactions. The xid and subxids fields of these are valid, as are the @@ -436,7 +437,9 @@ ProcArrayAdd(PGPROC *proc) ProcArrayStruct *arrayP = procArray; int index; + /* See ProcGlobal comment explaining why both locks are held */ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); if (arrayP->numProcs >= arrayP->maxProcs) { @@ -445,7 +448,6 @@ ProcArrayAdd(PGPROC *proc) * fixed supply of PGPROC structs too, and so we should have failed * earlier.) */ - LWLockRelease(ProcArrayLock); ereport(FATAL, (errcode(ERRCODE_TOO_MANY_CONNECTIONS), errmsg("sorry, too many clients already"))); @@ -471,10 +473,25 @@ ProcArrayAdd(PGPROC *proc) } memmove(&arrayP->pgprocnos[index + 1], &arrayP->pgprocnos[index], - (arrayP->numProcs - index) * sizeof(int)); + (arrayP->numProcs - index) * sizeof(*arrayP->pgprocnos)); + memmove(&ProcGlobal->xids[index + 1], &ProcGlobal->xids[index], + (arrayP->numProcs - index) * sizeof(*ProcGlobal->xids)); + arrayP->pgprocnos[index] = proc->pgprocno; + ProcGlobal->xids[index] = proc->xid; + arrayP->numProcs++; + for (; index < arrayP->numProcs; index++) + { + allProcs[arrayP->pgprocnos[index]].pgxactoff = index; + } + + /* + * Release in reversed acquisition order, to reduce frequency of having to + * wait for XidGenLock while holding ProcArrayLock. 
+ */ + LWLockRelease(XidGenLock); LWLockRelease(ProcArrayLock); } @@ -500,36 +517,58 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid) DisplayXidCache(); #endif + /* See ProcGlobal comment explaining why both locks are held */ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); + + Assert(ProcGlobal->allProcs[arrayP->pgprocnos[proc->pgxactoff]].pgxactoff == proc->pgxactoff); if (TransactionIdIsValid(latestXid)) { - Assert(TransactionIdIsValid(allPgXact[proc->pgprocno].xid)); + Assert(TransactionIdIsValid(ProcGlobal->xids[proc->pgxactoff])); /* Advance global latestCompletedXid while holding the lock */ MaintainLatestCompletedXid(latestXid); + + ProcGlobal->xids[proc->pgxactoff] = 0; } else { /* Shouldn't be trying to remove a live transaction here */ - Assert(!TransactionIdIsValid(allPgXact[proc->pgprocno].xid)); + Assert(!TransactionIdIsValid(ProcGlobal->xids[proc->pgxactoff])); } + Assert(TransactionIdIsValid(ProcGlobal->xids[proc->pgxactoff] == 0)); + for (index = 0; index < arrayP->numProcs; index++) { if (arrayP->pgprocnos[index] == proc->pgprocno) { /* Keep the PGPROC array sorted. See notes above */ memmove(&arrayP->pgprocnos[index], &arrayP->pgprocnos[index + 1], - (arrayP->numProcs - index - 1) * sizeof(int)); + (arrayP->numProcs - index - 1) * sizeof(*arrayP->pgprocnos)); + memmove(&ProcGlobal->xids[index], &ProcGlobal->xids[index + 1], + (arrayP->numProcs - index - 1) * sizeof(*ProcGlobal->xids)); + arrayP->pgprocnos[arrayP->numProcs - 1] = -1; /* for debugging */ arrayP->numProcs--; + + /* adjust for removed PGPROC */ + for (; index < arrayP->numProcs; index++) + allProcs[arrayP->pgprocnos[index]].pgxactoff--; + + /* + * Release in reversed acquisition order, to reduce frequency of + * having to wait for XidGenLock while holding ProcArrayLock. 
+ */ + LWLockRelease(XidGenLock); LWLockRelease(ProcArrayLock); return; } } /* Oops */ + LWLockRelease(XidGenLock); LWLockRelease(ProcArrayLock); elog(LOG, "failed to find proc %p in ProcArray", proc); @@ -562,7 +601,7 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) * else is taking a snapshot. See discussion in * src/backend/access/transam/README. */ - Assert(TransactionIdIsValid(allPgXact[proc->pgprocno].xid)); + Assert(TransactionIdIsValid(proc->xid)); /* * If we can immediately acquire ProcArrayLock, we clear our own XID @@ -584,7 +623,7 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) * anyone else's calculation of a snapshot. We might change their * estimate of global xmin, but that's OK. */ - Assert(!TransactionIdIsValid(allPgXact[proc->pgprocno].xid)); + Assert(!TransactionIdIsValid(proc->xid)); proc->lxid = InvalidLocalTransactionId; /* must be cleared with xid/xmin: */ @@ -607,7 +646,13 @@ static inline void ProcArrayEndTransactionInternal(PGPROC *proc, PGXACT *pgxact, TransactionId latestXid) { - pgxact->xid = InvalidTransactionId; + size_t pgxactoff = proc->pgxactoff; + + Assert(TransactionIdIsValid(ProcGlobal->xids[pgxactoff])); + Assert(ProcGlobal->xids[pgxactoff] == proc->xid); + + ProcGlobal->xids[pgxactoff] = InvalidTransactionId; + proc->xid = InvalidTransactionId; proc->lxid = InvalidLocalTransactionId; /* must be cleared with xid/xmin: */ pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK; @@ -643,7 +688,7 @@ ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid) uint32 wakeidx; /* We should definitely have an XID to clear. */ - Assert(TransactionIdIsValid(allPgXact[proc->pgprocno].xid)); + Assert(TransactionIdIsValid(proc->xid)); /* Add ourselves to the list of processes needing a group XID clear. */ proc->procArrayGroupMember = true; @@ -748,20 +793,28 @@ ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid) * This is used after successfully preparing a 2-phase transaction. 
We are * not actually reporting the transaction's XID as no longer running --- it * will still appear as running because the 2PC's gxact is in the ProcArray - * too. We just have to clear out our own PGXACT. + * too. We just have to clear out our own PGPROC. */ void ProcArrayClearTransaction(PGPROC *proc) { PGXACT *pgxact = &allPgXact[proc->pgprocno]; + size_t pgxactoff; /* - * We can skip locking ProcArrayLock here, because this action does not - * actually change anyone's view of the set of running XIDs: our entry is - * duplicate with the gxact that has already been inserted into the - * ProcArray. + * We can skip locking ProcArrayLock exclusively here, because this action + * does not actually change anyone's view of the set of running XIDs: our + * entry is duplicate with the gxact that has already been inserted into + * the ProcArray. But need it in shared mode for pgproc->pgxactoff to stay + * the same. */ - pgxact->xid = InvalidTransactionId; + LWLockAcquire(ProcArrayLock, LW_SHARED); + + pgxactoff = proc->pgxactoff; + + ProcGlobal->xids[pgxactoff] = InvalidTransactionId; + proc->xid = InvalidTransactionId; + proc->lxid = InvalidLocalTransactionId; proc->xmin = InvalidTransactionId; proc->recoveryConflictPending = false; @@ -773,6 +826,8 @@ ProcArrayClearTransaction(PGPROC *proc) /* Clear the subtransaction-XID cache too */ pgxact->nxids = 0; pgxact->overflowed = false; + + LWLockRelease(ProcArrayLock); } /* @@ -1167,7 +1222,7 @@ ProcArrayApplyXidAssignment(TransactionId topxid, * there are four possibilities for finding a running transaction: * * 1. The given Xid is a main transaction Id. We will find this out cheaply - * by looking at the PGXACT struct for each backend. + * by looking at ProcGlobal->xids. * * 2. The given Xid is one of the cached subxact Xids in the PGPROC array. * We can find this out cheaply too. @@ -1176,26 +1231,28 @@ ProcArrayApplyXidAssignment(TransactionId topxid, * if the Xid is running on the primary. * * 4. 
Search the SubTrans tree to find the Xid's topmost parent, and then see - * if that is running according to PGXACT or KnownAssignedXids. This is the - * slowest way, but sadly it has to be done always if the others failed, - * unless we see that the cached subxact sets are complete (none have + * if that is running according to ProcGlobal->xids[] or KnownAssignedXids. + * This is the slowest way, but sadly it has to be done always if the others + * failed, unless we see that the cached subxact sets are complete (none have * overflowed). * * ProcArrayLock has to be held while we do 1, 2, 3. If we save the top Xids * while doing 1 and 3, we can release the ProcArrayLock while we do 4. * This buys back some concurrency (and we can't retrieve the main Xids from - * PGXACT again anyway; see GetNewTransactionId). + * ProcGlobal->xids[] again anyway; see GetNewTransactionId). */ bool TransactionIdIsInProgress(TransactionId xid) { static TransactionId *xids = NULL; + static TransactionId *other_xids; int nxids = 0; ProcArrayStruct *arrayP = procArray; TransactionId topxid; TransactionId latestCompletedXid; - int i, - j; + int mypgxactoff; + size_t numProcs; + int j; /* * Don't bother checking a transaction older than RecentXmin; it could not @@ -1250,6 +1307,8 @@ TransactionIdIsInProgress(TransactionId xid) errmsg("out of memory"))); } + other_xids = ProcGlobal->xids; + LWLockAcquire(ProcArrayLock, LW_SHARED); /* @@ -1266,20 +1325,22 @@ TransactionIdIsInProgress(TransactionId xid) } /* No shortcuts, gotta grovel through the array */ - for (i = 0; i < arrayP->numProcs; i++) + mypgxactoff = MyProc->pgxactoff; + numProcs = arrayP->numProcs; + for (size_t pgxactoff = 0; pgxactoff < numProcs; pgxactoff++) { - int pgprocno = arrayP->pgprocnos[i]; - PGPROC *proc = &allProcs[pgprocno]; - PGXACT *pgxact = &allPgXact[pgprocno]; + int pgprocno; + PGXACT *pgxact; + PGPROC *proc; TransactionId pxid; int pxids; - /* Ignore my own proc --- dealt with it above */ - if (proc == MyProc) + 
/* Ignore ourselves --- dealt with it above */ + if (pgxactoff == mypgxactoff) continue; /* Fetch xid just once - see GetNewTransactionId */ - pxid = UINT32_ACCESS_ONCE(pgxact->xid); + pxid = UINT32_ACCESS_ONCE(other_xids[pgxactoff]); if (!TransactionIdIsValid(pxid)) continue; @@ -1304,8 +1365,12 @@ TransactionIdIsInProgress(TransactionId xid) /* * Step 2: check the cached child-Xids arrays */ + pgprocno = arrayP->pgprocnos[pgxactoff]; + pgxact = &allPgXact[pgprocno]; pxids = pgxact->nxids; pg_read_barrier(); /* pairs with barrier in GetNewTransactionId() */ + pgprocno = arrayP->pgprocnos[pgxactoff]; + proc = &allProcs[pgprocno]; for (j = pxids - 1; j >= 0; j--) { /* Fetch xid just once - see GetNewTransactionId */ @@ -1336,7 +1401,7 @@ TransactionIdIsInProgress(TransactionId xid) */ if (RecoveryInProgress()) { - /* none of the PGXACT entries should have XIDs in hot standby mode */ + /* none of the PGPROC entries should have XIDs in hot standby mode */ Assert(nxids == 0); if (KnownAssignedXidExists(xid)) @@ -1391,7 +1456,7 @@ TransactionIdIsInProgress(TransactionId xid) Assert(TransactionIdIsValid(topxid)); if (!TransactionIdEquals(topxid, xid)) { - for (i = 0; i < nxids; i++) + for (int i = 0; i < nxids; i++) { if (TransactionIdEquals(xids[i], topxid)) return true; @@ -1414,6 +1479,7 @@ TransactionIdIsActive(TransactionId xid) { bool result = false; ProcArrayStruct *arrayP = procArray; + TransactionId *other_xids = ProcGlobal->xids; int i; /* @@ -1429,11 +1495,10 @@ TransactionIdIsActive(TransactionId xid) { int pgprocno = arrayP->pgprocnos[i]; PGPROC *proc = &allProcs[pgprocno]; - PGXACT *pgxact = &allPgXact[pgprocno]; TransactionId pxid; /* Fetch xid just once - see GetNewTransactionId */ - pxid = UINT32_ACCESS_ONCE(pgxact->xid); + pxid = UINT32_ACCESS_ONCE(other_xids[i]); if (!TransactionIdIsValid(pxid)) continue; @@ -1519,6 +1584,7 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h) ProcArrayStruct *arrayP = procArray; TransactionId kaxmin; bool in_recovery = 
RecoveryInProgress(); + TransactionId *other_xids = ProcGlobal->xids; /* inferred after ProcArrayLock is released */ h->catalog_oldest_nonremovable = InvalidTransactionId; @@ -1562,7 +1628,7 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h) TransactionId xmin; /* Fetch xid just once - see GetNewTransactionId */ - xid = UINT32_ACCESS_ONCE(pgxact->xid); + xid = UINT32_ACCESS_ONCE(other_xids[pgprocno]); xmin = UINT32_ACCESS_ONCE(proc->xmin); /* @@ -1852,14 +1918,17 @@ Snapshot GetSnapshotData(Snapshot snapshot) { ProcArrayStruct *arrayP = procArray; + TransactionId *other_xids = ProcGlobal->xids; TransactionId xmin; TransactionId xmax; - int index; - int count = 0; + size_t count = 0; int subcount = 0; bool suboverflowed = false; FullTransactionId latest_completed; TransactionId oldestxid; + int mypgxactoff; + TransactionId myxid; + TransactionId replication_slot_xmin = InvalidTransactionId; TransactionId replication_slot_catalog_xmin = InvalidTransactionId; @@ -1904,6 +1973,10 @@ GetSnapshotData(Snapshot snapshot) LWLockAcquire(ProcArrayLock, LW_SHARED); latest_completed = ShmemVariableCache->latestCompletedXid; + mypgxactoff = MyProc->pgxactoff; + myxid = other_xids[mypgxactoff]; + Assert(myxid == MyProc->xid); + oldestxid = ShmemVariableCache->oldestXid; /* xmax is always latestCompletedXid + 1 */ @@ -1914,57 +1987,79 @@ GetSnapshotData(Snapshot snapshot) /* initialize xmin calculation with xmax */ xmin = xmax; + /* take own xid into account, saves a check inside the loop */ + if (TransactionIdIsNormal(myxid) && NormalTransactionIdPrecedes(myxid, xmin)) + xmin = myxid; + snapshot->takenDuringRecovery = RecoveryInProgress(); if (!snapshot->takenDuringRecovery) { + size_t numProcs = arrayP->numProcs; + TransactionId *xip = snapshot->xip; int *pgprocnos = arrayP->pgprocnos; - int numProcs; /* - * Spin over procArray checking xid, xmin, and subxids. The goal is - * to gather all active xids, find the lowest xmin, and try to record - * subxids. 
+ * First collect set of pgxactoff/xids that need to be included in the + * snapshot. */ - numProcs = arrayP->numProcs; - for (index = 0; index < numProcs; index++) + for (size_t pgxactoff = 0; pgxactoff < numProcs; pgxactoff++) { - int pgprocno = pgprocnos[index]; - PGXACT *pgxact = &allPgXact[pgprocno]; - TransactionId xid; + /* Fetch xid just once - see GetNewTransactionId */ + TransactionId xid = UINT32_ACCESS_ONCE(other_xids[pgxactoff]); + int pgprocno; + PGXACT *pgxact; + uint8 vacuumFlags; + + Assert(allProcs[arrayP->pgprocnos[pgxactoff]].pgxactoff == pgxactoff); /* - * Skip over backends doing logical decoding which manages xmin - * separately (check below) and ones running LAZY VACUUM. + * If the transaction has no XID assigned, we can skip it; it + * won't have sub-XIDs either. */ - if (pgxact->vacuumFlags & - (PROC_IN_LOGICAL_DECODING | PROC_IN_VACUUM)) + if (likely(xid == InvalidTransactionId)) continue; - /* Fetch xid just once - see GetNewTransactionId */ - xid = UINT32_ACCESS_ONCE(pgxact->xid); + /* + * We don't include our own XIDs (if any) in the snapshot. It + * needs to be included in the xmin computation, but we did so + * outside the loop. + */ + if (pgxactoff == mypgxactoff) + continue; /* - * If the transaction has no XID assigned, we can skip it; it - * won't have sub-XIDs either. If the XID is >= xmax, we can also - * skip it; such transactions will be treated as running anyway - * (and any sub-XIDs will also be >= xmax). + * The only way we are able to get here with a non-normal xid + * is during bootstrap - with this backend using + * BootstrapTransactionId. But the above test should filter + * that out. */ - if (!TransactionIdIsNormal(xid) - || !NormalTransactionIdPrecedes(xid, xmax)) + Assert(TransactionIdIsNormal(xid)); + + /* + * If the XID is >= xmax, we can skip it; such transactions will + * be treated as running anyway (and any sub-XIDs will also be >= + * xmax). 
+ */ + if (!NormalTransactionIdPrecedes(xid, xmax)) continue; + pgprocno = pgprocnos[pgxactoff]; + pgxact = &allPgXact[pgprocno]; + vacuumFlags = pgxact->vacuumFlags; + /* - * We don't include our own XIDs (if any) in the snapshot, but we - * must include them in xmin. + * Skip over backends doing logical decoding which manages xmin + * separately (check below) and ones running LAZY VACUUM. */ + if (vacuumFlags & (PROC_IN_LOGICAL_DECODING | PROC_IN_VACUUM)) + continue; + if (NormalTransactionIdPrecedes(xid, xmin)) xmin = xid; - if (pgxact == MyPgXact) - continue; /* Add XID to snapshot. */ - snapshot->xip[count++] = xid; + xip[count++] = xid; /* * Save subtransaction XIDs if possible (if we've already @@ -1987,9 +2082,9 @@ GetSnapshotData(Snapshot snapshot) suboverflowed = true; else { - int nxids = pgxact->nxids; + int nsubxids = pgxact->nxids; - if (nxids > 0) + if (nsubxids > 0) { PGPROC *proc = &allProcs[pgprocno]; @@ -1997,8 +2092,8 @@ GetSnapshotData(Snapshot snapshot) memcpy(snapshot->subxip + subcount, (void *) proc->subxids.xids, - nxids * sizeof(TransactionId)); - subcount += nxids; + nsubxids * sizeof(TransactionId)); + subcount += nsubxids; } } } @@ -2130,6 +2225,7 @@ GetSnapshotData(Snapshot snapshot) } RecentXmin = xmin; + Assert(TransactionIdPrecedesOrEquals(TransactionXmin, RecentXmin)); snapshot->xmin = xmin; snapshot->xmax = xmax; @@ -2292,7 +2388,7 @@ ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc) * GetRunningTransactionData -- returns information about running transactions. * * Similar to GetSnapshotData but returns more information. We include - * all PGXACTs with an assigned TransactionId, even VACUUM processes and + * all PGPROCs with an assigned TransactionId, even VACUUM processes and * prepared transactions. 
* * We acquire XidGenLock and ProcArrayLock, but the caller is responsible for @@ -2307,7 +2403,7 @@ ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc) * This is never executed during recovery so there is no need to look at * KnownAssignedXids. * - * Dummy PGXACTs from prepared transaction are included, meaning that this + * Dummy PGPROCs from prepared transaction are included, meaning that this * may return entries with duplicated TransactionId values coming from * transaction finishing to prepare. Nothing is done about duplicated * entries here to not hold on ProcArrayLock more than necessary. @@ -2326,6 +2422,7 @@ GetRunningTransactionData(void) static RunningTransactionsData CurrentRunningXactsData; ProcArrayStruct *arrayP = procArray; + TransactionId *other_xids = ProcGlobal->xids; RunningTransactions CurrentRunningXacts = &CurrentRunningXactsData; TransactionId latestCompletedXid; TransactionId oldestRunningXid; @@ -2386,7 +2483,7 @@ GetRunningTransactionData(void) TransactionId xid; /* Fetch xid just once - see GetNewTransactionId */ - xid = UINT32_ACCESS_ONCE(pgxact->xid); + xid = UINT32_ACCESS_ONCE(other_xids[index]); /* * We don't need to store transactions that don't have a TransactionId @@ -2483,7 +2580,7 @@ GetRunningTransactionData(void) * GetOldestActiveTransactionId() * * Similar to GetSnapshotData but returns just oldestActiveXid. We include - * all PGXACTs with an assigned TransactionId, even VACUUM processes. + * all PGPROCs with an assigned TransactionId, even VACUUM processes. * We look at all databases, though there is no need to include WALSender * since this has no effect on hot standby conflicts. 
* @@ -2498,6 +2595,7 @@ TransactionId GetOldestActiveTransactionId(void) { ProcArrayStruct *arrayP = procArray; + TransactionId *other_xids = ProcGlobal->xids; TransactionId oldestRunningXid; int index; @@ -2520,12 +2618,10 @@ GetOldestActiveTransactionId(void) LWLockAcquire(ProcArrayLock, LW_SHARED); for (index = 0; index < arrayP->numProcs; index++) { - int pgprocno = arrayP->pgprocnos[index]; - PGXACT *pgxact = &allPgXact[pgprocno]; TransactionId xid; /* Fetch xid just once - see GetNewTransactionId */ - xid = UINT32_ACCESS_ONCE(pgxact->xid); + xid = UINT32_ACCESS_ONCE(other_xids[index]); if (!TransactionIdIsNormal(xid)) continue; @@ -2603,8 +2699,8 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly) * If we're not in recovery, we walk over the procarray and collect the * lowest xid. Since we're called with ProcArrayLock held and have * acquired XidGenLock, no entries can vanish concurrently, since - * PGXACT->xid is only set with XidGenLock held and only cleared with - * ProcArrayLock held. + * ProcGlobal->xids[i] is only set with XidGenLock held and only cleared + * with ProcArrayLock held. * * In recovery we can't lower the safe value besides what we've computed * above, so we'll have to wait a bit longer there. 
We unfortunately can @@ -2613,17 +2709,17 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly) */ if (!recovery_in_progress) { + TransactionId *other_xids = ProcGlobal->xids; + /* - * Spin over procArray collecting all min(PGXACT->xid) + * Spin over procArray collecting min(ProcGlobal->xids[i]) */ for (index = 0; index < arrayP->numProcs; index++) { - int pgprocno = arrayP->pgprocnos[index]; - PGXACT *pgxact = &allPgXact[pgprocno]; TransactionId xid; /* Fetch xid just once - see GetNewTransactionId */ - xid = UINT32_ACCESS_ONCE(pgxact->xid); + xid = UINT32_ACCESS_ONCE(other_xids[index]); if (!TransactionIdIsNormal(xid)) continue; @@ -2811,6 +2907,7 @@ BackendXidGetPid(TransactionId xid) { int result = 0; ProcArrayStruct *arrayP = procArray; + TransactionId *other_xids = ProcGlobal->xids; int index; if (xid == InvalidTransactionId) /* never match invalid xid */ @@ -2822,9 +2919,8 @@ BackendXidGetPid(TransactionId xid) { int pgprocno = arrayP->pgprocnos[index]; PGPROC *proc = &allProcs[pgprocno]; - PGXACT *pgxact = &allPgXact[pgprocno]; - if (pgxact->xid == xid) + if (other_xids[index] == xid) { result = proc->pid; break; @@ -3104,7 +3200,6 @@ MinimumActiveBackends(int min) { int pgprocno = arrayP->pgprocnos[index]; PGPROC *proc = &allProcs[pgprocno]; - PGXACT *pgxact = &allPgXact[pgprocno]; /* * Since we're not holding a lock, need to be prepared to deal with @@ -3121,7 +3216,7 @@ MinimumActiveBackends(int min) continue; /* do not count deleted entries */ if (proc == MyProc) continue; /* do not count myself */ - if (pgxact->xid == InvalidTransactionId) + if (proc->xid == InvalidTransactionId) continue; /* do not count if no XID assigned */ if (proc->pid == 0) continue; /* do not count prepared xacts */ @@ -3547,8 +3642,8 @@ XidCacheRemoveRunningXids(TransactionId xid, * * Note that we do not have to be careful about memory ordering of our own * reads wrt. GetNewTransactionId() here - only this process can modify - * relevant fields of MyProc/MyPgXact. 
But we do have to be careful about - * our own writes being well ordered. + * relevant fields of MyProc/ProcGlobal->xids[]. But we do have to be + * careful about our own writes being well ordered. */ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); @@ -3906,7 +4001,7 @@ FullXidRelativeTo(FullTransactionId rel, TransactionId xid) * In Hot Standby mode, we maintain a list of transactions that are (or were) * running on the primary at the current point in WAL. These XIDs must be * treated as running by standby transactions, even though they are not in - * the standby server's PGXACT array. + * the standby server's PGPROC array. * * We record all XIDs that we know have been assigned. That includes all the * XIDs seen in WAL records, plus all unobserved XIDs that we can deduce have diff --git a/src/backend/storage/ipc/sinvaladt.c b/src/backend/storage/ipc/sinvaladt.c index ad048bc85fab..a9477ccb4a30 100644 --- a/src/backend/storage/ipc/sinvaladt.c +++ b/src/backend/storage/ipc/sinvaladt.c @@ -417,9 +417,7 @@ BackendIdGetTransactionIds(int backendID, TransactionId *xid, TransactionId *xmi if (proc != NULL) { - PGXACT *xact = &ProcGlobal->allPgXact[proc->pgprocno]; - - *xid = xact->xid; + *xid = proc->xid; *xmin = proc->xmin; } } diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 95989ce79bd6..d86566f4554b 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -3974,9 +3974,8 @@ GetRunningTransactionLocks(int *nlocks) proclock->tag.myLock->tag.locktag_type == LOCKTAG_RELATION) { PGPROC *proc = proclock->tag.myProc; - PGXACT *pgxact = &ProcGlobal->allPgXact[proc->pgprocno]; LOCK *lock = proclock->tag.myLock; - TransactionId xid = pgxact->xid; + TransactionId xid = proc->xid; /* * Don't record locks for transactions if we know they have diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index de346cd87fcd..7fad49544ce0 100644 --- a/src/backend/storage/lmgr/proc.c +++ 
b/src/backend/storage/lmgr/proc.c @@ -102,21 +102,18 @@ Size ProcGlobalShmemSize(void) { Size size = 0; + Size TotalProcs = + add_size(MaxBackends, add_size(NUM_AUXILIARY_PROCS, max_prepared_xacts)); /* ProcGlobal */ size = add_size(size, sizeof(PROC_HDR)); - /* MyProcs, including autovacuum workers and launcher */ - size = add_size(size, mul_size(MaxBackends, sizeof(PGPROC))); - /* AuxiliaryProcs */ - size = add_size(size, mul_size(NUM_AUXILIARY_PROCS, sizeof(PGPROC))); - /* Prepared xacts */ - size = add_size(size, mul_size(max_prepared_xacts, sizeof(PGPROC))); - /* ProcStructLock */ + size = add_size(size, mul_size(TotalProcs, sizeof(PGPROC))); size = add_size(size, sizeof(slock_t)); size = add_size(size, mul_size(MaxBackends, sizeof(PGXACT))); size = add_size(size, mul_size(NUM_AUXILIARY_PROCS, sizeof(PGXACT))); size = add_size(size, mul_size(max_prepared_xacts, sizeof(PGXACT))); + size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->xids))); return size; } @@ -216,6 +213,17 @@ InitProcGlobal(void) MemSet(pgxacts, 0, TotalProcs * sizeof(PGXACT)); ProcGlobal->allPgXact = pgxacts; + /* + * Allocate arrays mirroring PGPROC fields in a dense manner. See + * PROC_HDR. + * + * XXX: It might make sense to increase padding for these arrays, given + * how hotly they are accessed. + */ + ProcGlobal->xids = + (TransactionId *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->xids)); + MemSet(ProcGlobal->xids, 0, TotalProcs * sizeof(*ProcGlobal->xids)); + for (i = 0; i < TotalProcs; i++) { /* Common initialization for all PGPROCs, regardless of type. 
*/ @@ -387,7 +395,7 @@ InitProcess(void) MyProc->lxid = InvalidLocalTransactionId; MyProc->fpVXIDLock = false; MyProc->fpLocalTransactionId = InvalidLocalTransactionId; - MyPgXact->xid = InvalidTransactionId; + MyProc->xid = InvalidTransactionId; MyProc->xmin = InvalidTransactionId; MyProc->pid = MyProcPid; /* backendId, databaseId and roleId will be filled in later */ @@ -571,7 +579,7 @@ InitAuxiliaryProcess(void) MyProc->lxid = InvalidLocalTransactionId; MyProc->fpVXIDLock = false; MyProc->fpLocalTransactionId = InvalidLocalTransactionId; - MyPgXact->xid = InvalidTransactionId; + MyProc->xid = InvalidTransactionId; MyProc->xmin = InvalidTransactionId; MyProc->backendId = InvalidBackendId; MyProc->databaseId = InvalidOid; diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index 5e4b028a5f98..e29ed85e53db 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -89,6 +89,17 @@ typedef enum * distinguished from a real one at need by the fact that it has pid == 0. * The semaphore and lock-activity fields in a prepared-xact PGPROC are unused, * but its myProcLocks[] lists are valid. + * + * Mirrored fields: + * + * Some fields in PGPROC (see "mirrored in ..." comment) are mirrored into an + * element of more densely packed ProcGlobal arrays. These arrays are indexed + * by PGPROC->pgxactoff. Both copies need to be maintained coherently. + * + * NB: The pgxactoff indexed value can *never* be accessed without holding + * locks. + * + * See PROC_HDR for details. */ struct PGPROC { @@ -101,6 +112,12 @@ struct PGPROC Latch procLatch; /* generic latch for process */ + + TransactionId xid; /* id of top-level transaction currently being + * executed by this proc, if running and XID + * is assigned; else InvalidTransactionId. 
+ * mirrored in ProcGlobal->xids[pgxactoff] */ + TransactionId xmin; /* minimal running XID as it was when we were * starting our xact, excluding LAZY VACUUM: * vacuum must not remove tuples deleted by @@ -110,6 +127,9 @@ struct PGPROC * being executed by this proc, if running; * else InvalidLocalTransactionId */ int pid; /* Backend's process ID; 0 if prepared xact */ + + int pgxactoff; /* offset into various ProcGlobal->arrays + * with data mirrored from this PGPROC */ int pgprocno; /* These fields are zero while a backend is still starting up: */ @@ -224,10 +244,6 @@ extern PGDLLIMPORT struct PGXACT *MyPgXact; */ typedef struct PGXACT { - TransactionId xid; /* id of top-level transaction currently being - * executed by this proc, if running and XID - * is assigned; else InvalidTransactionId */ - uint8 vacuumFlags; /* vacuum-related flags, see above */ bool overflowed; @@ -236,6 +252,57 @@ typedef struct PGXACT /* * There is one ProcGlobal struct for the whole database cluster. + * + * Adding/Removing an entry into the procarray requires holding *both* + * ProcArrayLock and XidGenLock in exclusive mode (in that order). Both are + * needed because the dense arrays (see below) are accessed from + * GetNewTransactionId() and GetSnapshotData(), and we don't want to add + * further contention by both using the same lock. Adding/Removing a procarray + * entry is much less frequent. + * + * Some fields in PGPROC are mirrored into more densely packed arrays (e.g. + * xids), with one entry for each backend. These arrays only contain entries + * for PGPROCs that have been added to the shared array with ProcArrayAdd() + * (in contrast to PGPROC array which has unused PGPROCs interspersed). + * + * The dense arrays are indexed by PGPROC->pgxactoff. Any concurrent + * ProcArrayAdd() / ProcArrayRemove() can lead to pgxactoff of a procarray + * member to change. 
Therefore it is only safe to use PGPROC->pgxactoff to + * access the dense array while holding either ProcArrayLock or XidGenLock. + * + * As long as a PGPROC is in the procarray, the mirrored values need to be + * maintained in both places in a coherent manner. + * + * The denser separate arrays are beneficial for three main reasons: First, to + * allow for as tight loops accessing the data as possible. Second, to prevent + * updates of frequently changing data (e.g. xmin) from invalidating + * cachelines also containing less frequently changing data (e.g. xid, + * vacuumFlags). Third to condense frequently accessed data into as few + * cachelines as possible. + * + * There are two main reasons to have the data mirrored between these dense + * arrays and PGPROC. First, as explained above, a PGPROC's array entries can + * only be accessed with either ProcArrayLock or XidGenLock held, whereas the + * PGPROC entries do not require that (obviously there may still be locking + * requirements around the individual field, separate from the concerns + * here). That is particularly important for a backend to efficiently checks + * it own values, which it often can safely do without locking. Second, the + * PGPROC fields allow to avoid unnecessary accesses and modification to the + * dense arrays. A backend's own PGPROC is more likely to be in a local cache, + * whereas the cachelines for the dense array will be modified by other + * backends (often removing it from the cache for other cores/sockets). At + * commit/abort time a check of the PGPROC value can avoid accessing/dirtying + * the corresponding array value. + * + * Basically it makes sense to access the PGPROC variable when checking a + * single backend's data, especially when already looking at the PGPROC for + * other reasons already. 
It makes sense to look at the "dense" arrays if we + * need to look at many / most entries, because we then benefit from the + * reduced indirection and better cross-process cache-ability. + * + * When entering a PGPROC for 2PC transactions with ProcArrayAdd(), the data + * in the dense arrays is initialized from the PGPROC while it already holds + * ProcArrayLock. */ typedef struct PROC_HDR { @@ -243,6 +310,10 @@ typedef struct PROC_HDR PGPROC *allProcs; /* Array of PGXACT structures (not including dummies for prepared txns) */ PGXACT *allPgXact; + + /* Array mirroring PGPROC.xid for each PGPROC currently in the procarray */ + TransactionId *xids; + /* Length of allProcs array */ uint32 allProcCount; /* Head of list of free PGPROC structures */ From 5788e258bb26495fab65ff3aa486268d1c50b123 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Wed, 15 Jul 2020 15:35:07 -0700 Subject: [PATCH 298/334] snapshot scalability: Move PGXACT->vacuumFlags to ProcGlobal->vacuumFlags. Similar to the previous commit this increases the chance that data frequently needed by GetSnapshotData() stays in l2 cache. As we now take care to not unnecessarily write to ProcGlobal->vacuumFlags, there should be very few modifications to the ProcGlobal->vacuumFlags array. 
Author: Andres Freund Reviewed-By: Robert Haas Reviewed-By: Thomas Munro Reviewed-By: David Rowley Discussion: https://postgr.es/m/20200301083601.ews6hz5dduc3w2se@alap3.anarazel.de --- src/backend/access/transam/twophase.c | 2 +- src/backend/commands/vacuum.c | 5 +- src/backend/postmaster/autovacuum.c | 6 +-- src/backend/replication/logical/logical.c | 3 +- src/backend/replication/slot.c | 3 +- src/backend/storage/ipc/procarray.c | 66 +++++++++++++++-------- src/backend/storage/lmgr/deadlock.c | 4 +- src/backend/storage/lmgr/proc.c | 16 +++--- src/include/storage/proc.h | 12 ++++- 9 files changed, 75 insertions(+), 42 deletions(-) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index a0398bf3a3e8..744b8a7f3935 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -466,7 +466,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid, proc->xid = xid; Assert(proc->xmin == InvalidTransactionId); proc->delayChkpt = false; - pgxact->vacuumFlags = 0; + proc->vacuumFlags = 0; proc->pid = 0; proc->backendId = InvalidBackendId; proc->databaseId = databaseid; diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 648e12c78d84..aba13c31d1bc 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -1728,9 +1728,10 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params) * might appear to go backwards, which is probably Not Good. 
*/ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - MyPgXact->vacuumFlags |= PROC_IN_VACUUM; + MyProc->vacuumFlags |= PROC_IN_VACUUM; if (params->is_wraparound) - MyPgXact->vacuumFlags |= PROC_VACUUM_FOR_WRAPAROUND; + MyProc->vacuumFlags |= PROC_VACUUM_FOR_WRAPAROUND; + ProcGlobal->vacuumFlags[MyProc->pgxactoff] = MyProc->vacuumFlags; LWLockRelease(ProcArrayLock); } diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index ac97e28be19c..c6ec657a9367 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -2493,7 +2493,7 @@ do_autovacuum(void) tab->at_datname, tab->at_nspname, tab->at_relname); EmitErrorReport(); - /* this resets the PGXACT flags too */ + /* this resets ProcGlobal->vacuumFlags[i] too */ AbortOutOfAnyTransaction(); FlushErrorState(); MemoryContextResetAndDeleteChildren(PortalContext); @@ -2509,7 +2509,7 @@ do_autovacuum(void) did_vacuum = true; - /* the PGXACT flags are reset at the next end of transaction */ + /* ProcGlobal->vacuumFlags[i] are reset at the next end of xact */ /* be tidy */ deleted: @@ -2686,7 +2686,7 @@ perform_work_item(AutoVacuumWorkItem *workitem) cur_datname, cur_nspname, cur_relname); EmitErrorReport(); - /* this resets the PGXACT flags too */ + /* this resets ProcGlobal->vacuumFlags[i] too */ AbortOutOfAnyTransaction(); FlushErrorState(); MemoryContextResetAndDeleteChildren(PortalContext); diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c index 57c5b513ccf8..0f6af952f939 100644 --- a/src/backend/replication/logical/logical.c +++ b/src/backend/replication/logical/logical.c @@ -181,7 +181,8 @@ StartupDecodingContext(List *output_plugin_options, if (!IsTransactionOrTransactionBlock()) { LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - MyPgXact->vacuumFlags |= PROC_IN_LOGICAL_DECODING; + MyProc->vacuumFlags |= PROC_IN_LOGICAL_DECODING; + ProcGlobal->vacuumFlags[MyProc->pgxactoff] = MyProc->vacuumFlags; 
LWLockRelease(ProcArrayLock); } diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c index 3dc01b6df22a..42c78eabd4eb 100644 --- a/src/backend/replication/slot.c +++ b/src/backend/replication/slot.c @@ -520,7 +520,8 @@ ReplicationSlotRelease(void) /* might not have been set when we've been a plain slot */ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - MyPgXact->vacuumFlags &= ~PROC_IN_LOGICAL_DECODING; + MyProc->vacuumFlags &= ~PROC_IN_LOGICAL_DECODING; + ProcGlobal->vacuumFlags[MyProc->pgxactoff] = MyProc->vacuumFlags; LWLockRelease(ProcArrayLock); } diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 0bf20a49375d..224da4f9510b 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -476,9 +476,12 @@ ProcArrayAdd(PGPROC *proc) (arrayP->numProcs - index) * sizeof(*arrayP->pgprocnos)); memmove(&ProcGlobal->xids[index + 1], &ProcGlobal->xids[index], (arrayP->numProcs - index) * sizeof(*ProcGlobal->xids)); + memmove(&ProcGlobal->vacuumFlags[index + 1], &ProcGlobal->vacuumFlags[index], + (arrayP->numProcs - index) * sizeof(*ProcGlobal->vacuumFlags)); arrayP->pgprocnos[index] = proc->pgprocno; ProcGlobal->xids[index] = proc->xid; + ProcGlobal->vacuumFlags[index] = proc->vacuumFlags; arrayP->numProcs++; @@ -539,6 +542,7 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid) } Assert(TransactionIdIsValid(ProcGlobal->xids[proc->pgxactoff] == 0)); + ProcGlobal->vacuumFlags[proc->pgxactoff] = 0; for (index = 0; index < arrayP->numProcs; index++) { @@ -549,6 +553,8 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid) (arrayP->numProcs - index - 1) * sizeof(*arrayP->pgprocnos)); memmove(&ProcGlobal->xids[index], &ProcGlobal->xids[index + 1], (arrayP->numProcs - index - 1) * sizeof(*ProcGlobal->xids)); + memmove(&ProcGlobal->vacuumFlags[index], &ProcGlobal->vacuumFlags[index + 1], + (arrayP->numProcs - index - 1) * sizeof(*ProcGlobal->vacuumFlags)); 
arrayP->pgprocnos[arrayP->numProcs - 1] = -1; /* for debugging */ arrayP->numProcs--; @@ -626,14 +632,24 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) Assert(!TransactionIdIsValid(proc->xid)); proc->lxid = InvalidLocalTransactionId; - /* must be cleared with xid/xmin: */ - pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK; proc->xmin = InvalidTransactionId; proc->delayChkpt = false; /* be sure this is cleared in abort */ proc->recoveryConflictPending = false; Assert(pgxact->nxids == 0); Assert(pgxact->overflowed == false); + + /* must be cleared with xid/xmin: */ + /* avoid unnecessarily dirtying shared cachelines */ + if (proc->vacuumFlags & PROC_VACUUM_STATE_MASK) + { + Assert(!LWLockHeldByMe(ProcArrayLock)); + LWLockAcquire(ProcArrayLock, LW_SHARED); + Assert(proc->vacuumFlags == ProcGlobal->vacuumFlags[proc->pgxactoff]); + proc->vacuumFlags &= ~PROC_VACUUM_STATE_MASK; + ProcGlobal->vacuumFlags[proc->pgxactoff] = proc->vacuumFlags; + LWLockRelease(ProcArrayLock); + } } } @@ -654,12 +670,18 @@ ProcArrayEndTransactionInternal(PGPROC *proc, PGXACT *pgxact, ProcGlobal->xids[pgxactoff] = InvalidTransactionId; proc->xid = InvalidTransactionId; proc->lxid = InvalidLocalTransactionId; - /* must be cleared with xid/xmin: */ - pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK; proc->xmin = InvalidTransactionId; proc->delayChkpt = false; /* be sure this is cleared in abort */ proc->recoveryConflictPending = false; + /* must be cleared with xid/xmin: */ + /* avoid unnecessarily dirtying shared cachelines */ + if (proc->vacuumFlags & PROC_VACUUM_STATE_MASK) + { + proc->vacuumFlags &= ~PROC_VACUUM_STATE_MASK; + ProcGlobal->vacuumFlags[proc->pgxactoff] = proc->vacuumFlags; + } + /* Clear the subtransaction-XID cache too while holding the lock */ pgxact->nxids = 0; pgxact->overflowed = false; @@ -819,9 +841,8 @@ ProcArrayClearTransaction(PGPROC *proc) proc->xmin = InvalidTransactionId; proc->recoveryConflictPending = false; - /* redundant, but just in case */ - 
pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK; - proc->delayChkpt = false; + Assert(!(proc->vacuumFlags & PROC_VACUUM_STATE_MASK)); + Assert(!proc->delayChkpt); /* Clear the subtransaction-XID cache too */ pgxact->nxids = 0; @@ -1623,7 +1644,7 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h) { int pgprocno = arrayP->pgprocnos[index]; PGPROC *proc = &allProcs[pgprocno]; - PGXACT *pgxact = &allPgXact[pgprocno]; + int8 vacuumFlags = ProcGlobal->vacuumFlags[index]; TransactionId xid; TransactionId xmin; @@ -1640,8 +1661,8 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h) */ xmin = TransactionIdOlder(xmin, xid); - /* if neither is set, this proc doesn't influence the horizon */ - if (!TransactionIdIsValid(xmin)) + /* if neither is set, this proc doesn't influence the horizon */ + if (!TransactionIdIsValid(xmin)) continue; /* @@ -1658,7 +1679,7 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h) * removed, as long as pg_subtrans is not truncated) or doing logical * decoding (which manages xmin separately, check below). 
*/ - if (pgxact->vacuumFlags & (PROC_IN_VACUUM | PROC_IN_LOGICAL_DECODING)) + if (vacuumFlags & (PROC_IN_VACUUM | PROC_IN_LOGICAL_DECODING)) continue; /* shared tables need to take backends in all database into account */ @@ -1998,6 +2019,7 @@ GetSnapshotData(Snapshot snapshot) size_t numProcs = arrayP->numProcs; TransactionId *xip = snapshot->xip; int *pgprocnos = arrayP->pgprocnos; + uint8 *allVacuumFlags = ProcGlobal->vacuumFlags; /* * First collect set of pgxactoff/xids that need to be included in the @@ -2007,8 +2029,6 @@ GetSnapshotData(Snapshot snapshot) { /* Fetch xid just once - see GetNewTransactionId */ TransactionId xid = UINT32_ACCESS_ONCE(other_xids[pgxactoff]); - int pgprocno; - PGXACT *pgxact; uint8 vacuumFlags; Assert(allProcs[arrayP->pgprocnos[pgxactoff]].pgxactoff == pgxactoff); @@ -2044,14 +2064,11 @@ GetSnapshotData(Snapshot snapshot) if (!NormalTransactionIdPrecedes(xid, xmax)) continue; - pgprocno = pgprocnos[pgxactoff]; - pgxact = &allPgXact[pgprocno]; - vacuumFlags = pgxact->vacuumFlags; - /* * Skip over backends doing logical decoding which manages xmin * separately (check below) and ones running LAZY VACUUM. */ + vacuumFlags = allVacuumFlags[pgxactoff]; if (vacuumFlags & (PROC_IN_LOGICAL_DECODING | PROC_IN_VACUUM)) continue; @@ -2078,6 +2095,9 @@ GetSnapshotData(Snapshot snapshot) */ if (!suboverflowed) { + int pgprocno = pgprocnos[pgxactoff]; + PGXACT *pgxact = &allPgXact[pgprocno]; + if (pgxact->overflowed) suboverflowed = true; else @@ -2296,11 +2316,11 @@ ProcArrayInstallImportedXmin(TransactionId xmin, { int pgprocno = arrayP->pgprocnos[index]; PGPROC *proc = &allProcs[pgprocno]; - PGXACT *pgxact = &allPgXact[pgprocno]; + int vacuumFlags = ProcGlobal->vacuumFlags[index]; TransactionId xid; /* Ignore procs running LAZY VACUUM */ - if (pgxact->vacuumFlags & PROC_IN_VACUUM) + if (vacuumFlags & PROC_IN_VACUUM) continue; /* We are only interested in the specific virtual transaction. 
*/ @@ -2990,12 +3010,12 @@ GetCurrentVirtualXIDs(TransactionId limitXmin, bool excludeXmin0, { int pgprocno = arrayP->pgprocnos[index]; PGPROC *proc = &allProcs[pgprocno]; - PGXACT *pgxact = &allPgXact[pgprocno]; + uint8 vacuumFlags = ProcGlobal->vacuumFlags[index]; if (proc == MyProc) continue; - if (excludeVacuum & pgxact->vacuumFlags) + if (excludeVacuum & vacuumFlags) continue; if (allDbs || proc->databaseId == MyDatabaseId) @@ -3410,7 +3430,7 @@ CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared) { int pgprocno = arrayP->pgprocnos[index]; PGPROC *proc = &allProcs[pgprocno]; - PGXACT *pgxact = &allPgXact[pgprocno]; + uint8 vacuumFlags = ProcGlobal->vacuumFlags[index]; if (proc->databaseId != databaseId) continue; @@ -3424,7 +3444,7 @@ CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared) else { (*nbackends)++; - if ((pgxact->vacuumFlags & PROC_IS_AUTOVACUUM) && + if ((vacuumFlags & PROC_IS_AUTOVACUUM) && nautovacs < MAXAUTOVACPIDS) autovac_pids[nautovacs++] = proc->pid; } diff --git a/src/backend/storage/lmgr/deadlock.c b/src/backend/storage/lmgr/deadlock.c index beedc7947db9..e1246b8a4da1 100644 --- a/src/backend/storage/lmgr/deadlock.c +++ b/src/backend/storage/lmgr/deadlock.c @@ -544,7 +544,6 @@ FindLockCycleRecurseMember(PGPROC *checkProc, { PGPROC *proc; LOCK *lock = checkProc->waitLock; - PGXACT *pgxact; PROCLOCK *proclock; SHM_QUEUE *procLocks; LockMethod lockMethodTable; @@ -582,7 +581,6 @@ FindLockCycleRecurseMember(PGPROC *checkProc, PGPROC *leader; proc = proclock->tag.myProc; - pgxact = &ProcGlobal->allPgXact[proc->pgprocno]; leader = proc->lockGroupLeader == NULL ? proc : proc->lockGroupLeader; /* A proc never blocks itself or any other lock group member */ @@ -630,7 +628,7 @@ FindLockCycleRecurseMember(PGPROC *checkProc, * ProcArrayLock. 
*/ if (checkProc == MyProc && - pgxact->vacuumFlags & PROC_IS_AUTOVACUUM) + proc->vacuumFlags & PROC_IS_AUTOVACUUM) blocking_autovacuum_proc = proc; /* We're done looking at this proclock */ diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 7fad49544ce0..f6113b2d2432 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -114,6 +114,7 @@ ProcGlobalShmemSize(void) size = add_size(size, mul_size(NUM_AUXILIARY_PROCS, sizeof(PGXACT))); size = add_size(size, mul_size(max_prepared_xacts, sizeof(PGXACT))); size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->xids))); + size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->vacuumFlags))); return size; } @@ -223,6 +224,8 @@ InitProcGlobal(void) ProcGlobal->xids = (TransactionId *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->xids)); MemSet(ProcGlobal->xids, 0, TotalProcs * sizeof(*ProcGlobal->xids)); + ProcGlobal->vacuumFlags = (uint8 *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->vacuumFlags)); + MemSet(ProcGlobal->vacuumFlags, 0, TotalProcs * sizeof(*ProcGlobal->vacuumFlags)); for (i = 0; i < TotalProcs; i++) { @@ -405,10 +408,10 @@ InitProcess(void) MyProc->tempNamespaceId = InvalidOid; MyProc->isBackgroundWorker = IsBackgroundWorker; MyProc->delayChkpt = false; - MyPgXact->vacuumFlags = 0; + MyProc->vacuumFlags = 0; /* NB -- autovac launcher intentionally does not set IS_AUTOVACUUM */ if (IsAutoVacuumWorkerProcess()) - MyPgXact->vacuumFlags |= PROC_IS_AUTOVACUUM; + MyProc->vacuumFlags |= PROC_IS_AUTOVACUUM; MyProc->lwWaiting = false; MyProc->lwWaitMode = 0; MyProc->waitLock = NULL; @@ -587,7 +590,7 @@ InitAuxiliaryProcess(void) MyProc->tempNamespaceId = InvalidOid; MyProc->isBackgroundWorker = IsBackgroundWorker; MyProc->delayChkpt = false; - MyPgXact->vacuumFlags = 0; + MyProc->vacuumFlags = 0; MyProc->lwWaiting = false; MyProc->lwWaitMode = 0; MyProc->waitLock = NULL; @@ -1323,7 +1326,7 @@ ProcSleep(LOCALLOCK *locallock, LockMethod 
lockMethodTable) if (deadlock_state == DS_BLOCKED_BY_AUTOVACUUM && allow_autovacuum_cancel) { PGPROC *autovac = GetBlockingAutoVacuumPgproc(); - PGXACT *autovac_pgxact = &ProcGlobal->allPgXact[autovac->pgprocno]; + uint8 vacuumFlags; LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); @@ -1331,8 +1334,9 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable) * Only do it if the worker is not working to protect against Xid * wraparound. */ - if ((autovac_pgxact->vacuumFlags & PROC_IS_AUTOVACUUM) && - !(autovac_pgxact->vacuumFlags & PROC_VACUUM_FOR_WRAPAROUND)) + vacuumFlags = ProcGlobal->vacuumFlags[proc->pgxactoff]; + if ((vacuumFlags & PROC_IS_AUTOVACUUM) && + !(vacuumFlags & PROC_VACUUM_FOR_WRAPAROUND)) { int pid = autovac->pid; StringInfoData locktagbuf; diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index e29ed85e53db..9f3a8b518eb2 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -41,7 +41,7 @@ struct XidCache }; /* - * Flags for PGXACT->vacuumFlags + * Flags for ProcGlobal->vacuumFlags[] */ #define PROC_IS_AUTOVACUUM 0x01 /* is it an autovac worker? */ #define PROC_IN_VACUUM 0x02 /* currently running lazy vacuum */ @@ -167,6 +167,9 @@ struct PGPROC bool delayChkpt; /* true if this proc delays checkpoint start */ + uint8 vacuumFlags; /* this backend's vacuum flags, see PROC_* + * above. mirrored in + * ProcGlobal->vacuumFlags[pgxactoff] */ /* * Info to allow us to wait for synchronous replication, if needed. * waitLSN is InvalidXLogRecPtr if not waiting; set only by user backend. @@ -244,7 +247,6 @@ extern PGDLLIMPORT struct PGXACT *MyPgXact; */ typedef struct PGXACT { - uint8 vacuumFlags; /* vacuum-related flags, see above */ bool overflowed; uint8 nxids; @@ -314,6 +316,12 @@ typedef struct PROC_HDR /* Array mirroring PGPROC.xid for each PGPROC currently in the procarray */ TransactionId *xids; + /* + * Array mirroring PGPROC.vacuumFlags for each PGPROC currently in the + * procarray. 
+ */ + uint8 *vacuumFlags; + /* Length of allProcs array */ uint32 allProcCount; /* Head of list of free PGPROC structures */ From 73487a60fc1063ba4b5178b69aee4ee210c182c4 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Fri, 14 Aug 2020 14:30:38 -0700 Subject: [PATCH 299/334] snapshot scalability: Move subxact info to ProcGlobal, remove PGXACT. Similar to the previous changes this increases the chance that data frequently needed by GetSnapshotData() stays in l2 cache. In many workloads subtransactions are very rare, and this makes the check for that considerably cheaper. As this removes the last member of PGXACT, there is no need to keep it around anymore. On a larger 2 socket machine this and the two preceding commits result in a ~1.07x performance increase in read-only pgbench. For read-heavy mixed r/w workloads without row level contention, I see about 1.1x. Author: Andres Freund Reviewed-By: Robert Haas Reviewed-By: Thomas Munro Reviewed-By: David Rowley Discussion: https://postgr.es/m/20200301083601.ews6hz5dduc3w2se@alap3.anarazel.de --- src/backend/access/transam/clog.c | 7 +- src/backend/access/transam/twophase.c | 17 ++-- src/backend/access/transam/varsup.c | 15 ++- src/backend/storage/ipc/procarray.c | 128 ++++++++++++++------------ src/backend/storage/lmgr/proc.c | 24 +---- src/include/storage/proc.h | 34 ++++--- src/tools/pgindent/typedefs.list | 1 - 7 files changed, 113 insertions(+), 113 deletions(-) diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index a4599e966106..65aa8841f7ce 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -295,7 +295,7 @@ TransactionIdSetPageStatus(TransactionId xid, int nsubxids, */ if (all_xact_same_page && xid == MyProc->xid && nsubxids <= THRESHOLD_SUBTRANS_CLOG_OPT && - nsubxids == MyPgXact->nxids && + nsubxids == MyProc->subxidStatus.count && memcmp(subxids, MyProc->subxids.xids, nsubxids * sizeof(TransactionId)) == 0) { @@ -510,16 +510,15 @@ 
TransactionGroupUpdateXidStatus(TransactionId xid, XidStatus status, while (nextidx != INVALID_PGPROCNO) { PGPROC *proc = &ProcGlobal->allProcs[nextidx]; - PGXACT *pgxact = &ProcGlobal->allPgXact[nextidx]; /* * Transactions with more than THRESHOLD_SUBTRANS_CLOG_OPT sub-XIDs * should not use group XID status update mechanism. */ - Assert(pgxact->nxids <= THRESHOLD_SUBTRANS_CLOG_OPT); + Assert(proc->subxidStatus.count <= THRESHOLD_SUBTRANS_CLOG_OPT); TransactionIdSetPageStatusInternal(proc->clogGroupMemberXid, - pgxact->nxids, + proc->subxidStatus.count, proc->subxids.xids, proc->clogGroupMemberXidStatus, proc->clogGroupMemberLsn, diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 744b8a7f3935..ef4f9981e359 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -21,9 +21,9 @@ * GIDs and aborts the transaction if there already is a global * transaction in prepared state with the same GID. * - * A global transaction (gxact) also has dummy PGXACT and PGPROC; this is - * what keeps the XID considered running by TransactionIdIsInProgress. - * It is also convenient as a PGPROC to hook the gxact's locks to. + * A global transaction (gxact) also has dummy PGPROC; this is what keeps + * the XID considered running by TransactionIdIsInProgress. It is also + * convenient as a PGPROC to hook the gxact's locks to. * * Information to recover prepared transactions in case of crash is * now stored in WAL for the common case. 
In some cases there will be @@ -447,14 +447,12 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid, TimestampTz prepared_at, Oid owner, Oid databaseid) { PGPROC *proc; - PGXACT *pgxact; int i; Assert(LWLockHeldByMeInMode(TwoPhaseStateLock, LW_EXCLUSIVE)); Assert(gxact != NULL); proc = &ProcGlobal->allProcs[gxact->pgprocno]; - pgxact = &ProcGlobal->allPgXact[gxact->pgprocno]; /* Initialize the PGPROC entry */ MemSet(proc, 0, sizeof(PGPROC)); @@ -480,8 +478,8 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid, for (i = 0; i < NUM_LOCK_PARTITIONS; i++) SHMQueueInit(&(proc->myProcLocks[i])); /* subxid data must be filled later by GXactLoadSubxactData */ - pgxact->overflowed = false; - pgxact->nxids = 0; + proc->subxidStatus.overflowed = false; + proc->subxidStatus.count = 0; gxact->prepared_at = prepared_at; gxact->xid = xid; @@ -510,19 +508,18 @@ GXactLoadSubxactData(GlobalTransaction gxact, int nsubxacts, TransactionId *children) { PGPROC *proc = &ProcGlobal->allProcs[gxact->pgprocno]; - PGXACT *pgxact = &ProcGlobal->allPgXact[gxact->pgprocno]; /* We need no extra lock since the GXACT isn't valid yet */ if (nsubxacts > PGPROC_MAX_CACHED_SUBXIDS) { - pgxact->overflowed = true; + proc->subxidStatus.overflowed = true; nsubxacts = PGPROC_MAX_CACHED_SUBXIDS; } if (nsubxacts > 0) { memcpy(proc->subxids.xids, children, nsubxacts * sizeof(TransactionId)); - pgxact->nxids = nsubxacts; + proc->subxidStatus.count = nsubxacts; } } diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 4c91b343ecd2..2d2b05be36c4 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -222,22 +222,31 @@ GetNewTransactionId(bool isSubXact) */ if (!isSubXact) { + Assert(ProcGlobal->subxidStates[MyProc->pgxactoff].count == 0); + Assert(!ProcGlobal->subxidStates[MyProc->pgxactoff].overflowed); + Assert(MyProc->subxidStatus.count == 0); + 
Assert(!MyProc->subxidStatus.overflowed); + /* LWLockRelease acts as barrier */ MyProc->xid = xid; ProcGlobal->xids[MyProc->pgxactoff] = xid; } else { - int nxids = MyPgXact->nxids; + XidCacheStatus *substat = &ProcGlobal->subxidStates[MyProc->pgxactoff]; + int nxids = MyProc->subxidStatus.count; + + Assert(substat->count == MyProc->subxidStatus.count); + Assert(substat->overflowed == MyProc->subxidStatus.overflowed); if (nxids < PGPROC_MAX_CACHED_SUBXIDS) { MyProc->subxids.xids[nxids] = xid; pg_write_barrier(); - MyPgXact->nxids = nxids + 1; + MyProc->subxidStatus.count = substat->count = nxids + 1; } else - MyPgXact->overflowed = true; + MyProc->subxidStatus.overflowed = substat->overflowed = true; } LWLockRelease(XidGenLock); diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 224da4f9510b..8262abd42e6b 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -4,9 +4,10 @@ * POSTGRES process array code. * * - * This module maintains arrays of the PGPROC and PGXACT structures for all - * active backends. Although there are several uses for this, the principal - * one is as a means of determining the set of currently running transactions. + * This module maintains arrays of PGPROC substructures, as well as associated + * arrays in ProcGlobal, for all active backends. Although there are several + * uses for this, the principal one is as a means of determining the set of + * currently running transactions. * * Because of various subtle race conditions it is critical that a backend * hold the correct locks while setting or clearing its xid (in @@ -85,7 +86,7 @@ typedef struct ProcArrayStruct /* * Highest subxid that has been removed from KnownAssignedXids array to * prevent overflow; or InvalidTransactionId if none. We track this for - * similar reasons to tracking overflowing cached subxids in PGXACT + * similar reasons to tracking overflowing cached subxids in PGPROC * entries. 
Must hold exclusive ProcArrayLock to change this, and shared * lock to read it. */ @@ -96,7 +97,7 @@ typedef struct ProcArrayStruct /* oldest catalog xmin of any replication slot */ TransactionId replication_slot_catalog_xmin; - /* indexes into allPgXact[], has PROCARRAY_MAXPROCS entries */ + /* indexes into allProcs[], has PROCARRAY_MAXPROCS entries */ int pgprocnos[FLEXIBLE_ARRAY_MEMBER]; } ProcArrayStruct; @@ -239,7 +240,6 @@ typedef struct ComputeXidHorizonsResult static ProcArrayStruct *procArray; static PGPROC *allProcs; -static PGXACT *allPgXact; /* * Bookkeeping for tracking emulated transactions in recovery @@ -325,8 +325,7 @@ static int KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, static TransactionId KnownAssignedXidsGetOldestXmin(void); static void KnownAssignedXidsDisplay(int trace_level); static void KnownAssignedXidsReset(void); -static inline void ProcArrayEndTransactionInternal(PGPROC *proc, - PGXACT *pgxact, TransactionId latestXid); +static inline void ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid); static void ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid); static void MaintainLatestCompletedXid(TransactionId latestXid); static void MaintainLatestCompletedXidRecovery(TransactionId latestXid); @@ -411,7 +410,6 @@ CreateSharedProcArray(void) } allProcs = ProcGlobal->allProcs; - allPgXact = ProcGlobal->allPgXact; /* Create or attach to the KnownAssignedXids arrays too, if needed */ if (EnableHotStandby) @@ -476,11 +474,14 @@ ProcArrayAdd(PGPROC *proc) (arrayP->numProcs - index) * sizeof(*arrayP->pgprocnos)); memmove(&ProcGlobal->xids[index + 1], &ProcGlobal->xids[index], (arrayP->numProcs - index) * sizeof(*ProcGlobal->xids)); + memmove(&ProcGlobal->subxidStates[index + 1], &ProcGlobal->subxidStates[index], + (arrayP->numProcs - index) * sizeof(*ProcGlobal->subxidStates)); memmove(&ProcGlobal->vacuumFlags[index + 1], &ProcGlobal->vacuumFlags[index], (arrayP->numProcs - index) * 
sizeof(*ProcGlobal->vacuumFlags)); arrayP->pgprocnos[index] = proc->pgprocno; ProcGlobal->xids[index] = proc->xid; + ProcGlobal->subxidStates[index] = proc->subxidStatus; ProcGlobal->vacuumFlags[index] = proc->vacuumFlags; arrayP->numProcs++; @@ -534,6 +535,8 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid) MaintainLatestCompletedXid(latestXid); ProcGlobal->xids[proc->pgxactoff] = 0; + ProcGlobal->subxidStates[proc->pgxactoff].overflowed = false; + ProcGlobal->subxidStates[proc->pgxactoff].count = 0; } else { @@ -542,6 +545,8 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid) } Assert(TransactionIdIsValid(ProcGlobal->xids[proc->pgxactoff] == 0)); + Assert(TransactionIdIsValid(ProcGlobal->subxidStates[proc->pgxactoff].count == 0)); + Assert(TransactionIdIsValid(ProcGlobal->subxidStates[proc->pgxactoff].overflowed == false)); ProcGlobal->vacuumFlags[proc->pgxactoff] = 0; for (index = 0; index < arrayP->numProcs; index++) @@ -553,6 +558,8 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid) (arrayP->numProcs - index - 1) * sizeof(*arrayP->pgprocnos)); memmove(&ProcGlobal->xids[index], &ProcGlobal->xids[index + 1], (arrayP->numProcs - index - 1) * sizeof(*ProcGlobal->xids)); + memmove(&ProcGlobal->subxidStates[index], &ProcGlobal->subxidStates[index + 1], + (arrayP->numProcs - index - 1) * sizeof(*ProcGlobal->subxidStates)); memmove(&ProcGlobal->vacuumFlags[index], &ProcGlobal->vacuumFlags[index + 1], (arrayP->numProcs - index - 1) * sizeof(*ProcGlobal->vacuumFlags)); @@ -597,8 +604,6 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid) void ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) { - PGXACT *pgxact = &allPgXact[proc->pgprocno]; - if (TransactionIdIsValid(latestXid)) { /* @@ -616,7 +621,7 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) */ if (LWLockConditionalAcquire(ProcArrayLock, LW_EXCLUSIVE)) { - ProcArrayEndTransactionInternal(proc, pgxact, latestXid); + ProcArrayEndTransactionInternal(proc, 
latestXid); LWLockRelease(ProcArrayLock); } else @@ -630,15 +635,14 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) * estimate of global xmin, but that's OK. */ Assert(!TransactionIdIsValid(proc->xid)); + Assert(proc->subxidStatus.count == 0); + Assert(!proc->subxidStatus.overflowed); proc->lxid = InvalidLocalTransactionId; proc->xmin = InvalidTransactionId; proc->delayChkpt = false; /* be sure this is cleared in abort */ proc->recoveryConflictPending = false; - Assert(pgxact->nxids == 0); - Assert(pgxact->overflowed == false); - /* must be cleared with xid/xmin: */ /* avoid unnecessarily dirtying shared cachelines */ if (proc->vacuumFlags & PROC_VACUUM_STATE_MASK) @@ -659,8 +663,7 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) * We don't do any locking here; caller must handle that. */ static inline void -ProcArrayEndTransactionInternal(PGPROC *proc, PGXACT *pgxact, - TransactionId latestXid) +ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid) { size_t pgxactoff = proc->pgxactoff; @@ -683,8 +686,15 @@ ProcArrayEndTransactionInternal(PGPROC *proc, PGXACT *pgxact, } /* Clear the subtransaction-XID cache too while holding the lock */ - pgxact->nxids = 0; - pgxact->overflowed = false; + Assert(ProcGlobal->subxidStates[pgxactoff].count == proc->subxidStatus.count && + ProcGlobal->subxidStates[pgxactoff].overflowed == proc->subxidStatus.overflowed); + if (proc->subxidStatus.count > 0 || proc->subxidStatus.overflowed) + { + ProcGlobal->subxidStates[pgxactoff].count = 0; + ProcGlobal->subxidStates[pgxactoff].overflowed = false; + proc->subxidStatus.count = 0; + proc->subxidStatus.overflowed = false; + } /* Also advance global latestCompletedXid while holding the lock */ MaintainLatestCompletedXid(latestXid); @@ -774,9 +784,8 @@ ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid) while (nextidx != INVALID_PGPROCNO) { PGPROC *proc = &allProcs[nextidx]; - PGXACT *pgxact = &allPgXact[nextidx]; - 
ProcArrayEndTransactionInternal(proc, pgxact, proc->procArrayGroupMemberXid); + ProcArrayEndTransactionInternal(proc, proc->procArrayGroupMemberXid); /* Move to next proc in list. */ nextidx = pg_atomic_read_u32(&proc->procArrayGroupNext); @@ -820,7 +829,6 @@ ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid) void ProcArrayClearTransaction(PGPROC *proc) { - PGXACT *pgxact = &allPgXact[proc->pgprocno]; size_t pgxactoff; /* @@ -845,8 +853,15 @@ ProcArrayClearTransaction(PGPROC *proc) Assert(!proc->delayChkpt); /* Clear the subtransaction-XID cache too */ - pgxact->nxids = 0; - pgxact->overflowed = false; + Assert(ProcGlobal->subxidStates[pgxactoff].count == proc->subxidStatus.count && + ProcGlobal->subxidStates[pgxactoff].overflowed == proc->subxidStatus.overflowed); + if (proc->subxidStatus.count > 0 || proc->subxidStatus.overflowed) + { + ProcGlobal->subxidStates[pgxactoff].count = 0; + ProcGlobal->subxidStates[pgxactoff].overflowed = false; + proc->subxidStatus.count = 0; + proc->subxidStatus.overflowed = false; + } LWLockRelease(ProcArrayLock); } @@ -1267,6 +1282,7 @@ TransactionIdIsInProgress(TransactionId xid) { static TransactionId *xids = NULL; static TransactionId *other_xids; + XidCacheStatus *other_subxidstates; int nxids = 0; ProcArrayStruct *arrayP = procArray; TransactionId topxid; @@ -1329,6 +1345,7 @@ TransactionIdIsInProgress(TransactionId xid) } other_xids = ProcGlobal->xids; + other_subxidstates = ProcGlobal->subxidStates; LWLockAcquire(ProcArrayLock, LW_SHARED); @@ -1351,7 +1368,6 @@ TransactionIdIsInProgress(TransactionId xid) for (size_t pgxactoff = 0; pgxactoff < numProcs; pgxactoff++) { int pgprocno; - PGXACT *pgxact; PGPROC *proc; TransactionId pxid; int pxids; @@ -1386,9 +1402,7 @@ TransactionIdIsInProgress(TransactionId xid) /* * Step 2: check the cached child-Xids arrays */ - pgprocno = arrayP->pgprocnos[pgxactoff]; - pgxact = &allPgXact[pgprocno]; - pxids = pgxact->nxids; + pxids = other_subxidstates[pgxactoff].count; 
pg_read_barrier(); /* pairs with barrier in GetNewTransactionId() */ pgprocno = arrayP->pgprocnos[pgxactoff]; proc = &allProcs[pgprocno]; @@ -1412,7 +1426,7 @@ TransactionIdIsInProgress(TransactionId xid) * we hold ProcArrayLock. So we can't miss an Xid that we need to * worry about.) */ - if (pgxact->overflowed) + if (other_subxidstates[pgxactoff].overflowed) xids[nxids++] = pxid; } @@ -2019,6 +2033,7 @@ GetSnapshotData(Snapshot snapshot) size_t numProcs = arrayP->numProcs; TransactionId *xip = snapshot->xip; int *pgprocnos = arrayP->pgprocnos; + XidCacheStatus *subxidStates = ProcGlobal->subxidStates; uint8 *allVacuumFlags = ProcGlobal->vacuumFlags; /* @@ -2095,17 +2110,16 @@ GetSnapshotData(Snapshot snapshot) */ if (!suboverflowed) { - int pgprocno = pgprocnos[pgxactoff]; - PGXACT *pgxact = &allPgXact[pgprocno]; - if (pgxact->overflowed) + if (subxidStates[pgxactoff].overflowed) suboverflowed = true; else { - int nsubxids = pgxact->nxids; + int nsubxids = subxidStates[pgxactoff].count; if (nsubxids > 0) { + int pgprocno = pgprocnos[pgxactoff]; PGPROC *proc = &allProcs[pgprocno]; pg_read_barrier(); /* pairs with GetNewTransactionId */ @@ -2498,8 +2512,6 @@ GetRunningTransactionData(void) */ for (index = 0; index < arrayP->numProcs; index++) { - int pgprocno = arrayP->pgprocnos[index]; - PGXACT *pgxact = &allPgXact[pgprocno]; TransactionId xid; /* Fetch xid just once - see GetNewTransactionId */ @@ -2520,7 +2532,7 @@ GetRunningTransactionData(void) if (TransactionIdPrecedes(xid, oldestRunningXid)) oldestRunningXid = xid; - if (pgxact->overflowed) + if (ProcGlobal->subxidStates[index].overflowed) suboverflowed = true; /* @@ -2540,27 +2552,28 @@ GetRunningTransactionData(void) */ if (!suboverflowed) { + XidCacheStatus *other_subxidstates = ProcGlobal->subxidStates; + for (index = 0; index < arrayP->numProcs; index++) { int pgprocno = arrayP->pgprocnos[index]; PGPROC *proc = &allProcs[pgprocno]; - PGXACT *pgxact = &allPgXact[pgprocno]; - int nxids; + int nsubxids; /* 
* Save subtransaction XIDs. Other backends can't add or remove * entries while we're holding XidGenLock. */ - nxids = pgxact->nxids; - if (nxids > 0) + nsubxids = other_subxidstates[index].count; + if (nsubxids > 0) { /* barrier not really required, as XidGenLock is held, but ... */ pg_read_barrier(); /* pairs with GetNewTransactionId */ memcpy(&xids[count], (void *) proc->subxids.xids, - nxids * sizeof(TransactionId)); - count += nxids; - subcount += nxids; + nsubxids * sizeof(TransactionId)); + count += nsubxids; + subcount += nsubxids; /* * Top-level XID of a transaction is always less than any of @@ -3627,14 +3640,6 @@ ProcArrayGetReplicationSlotXmin(TransactionId *xmin, LWLockRelease(ProcArrayLock); } - -#define XidCacheRemove(i) \ - do { \ - MyProc->subxids.xids[i] = MyProc->subxids.xids[MyPgXact->nxids - 1]; \ - pg_write_barrier(); \ - MyPgXact->nxids--; \ - } while (0) - /* * XidCacheRemoveRunningXids * @@ -3650,6 +3655,7 @@ XidCacheRemoveRunningXids(TransactionId xid, { int i, j; + XidCacheStatus *mysubxidstat; Assert(TransactionIdIsValid(xid)); @@ -3667,6 +3673,8 @@ XidCacheRemoveRunningXids(TransactionId xid, */ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + mysubxidstat = &ProcGlobal->subxidStates[MyProc->pgxactoff]; + /* * Under normal circumstances xid and xids[] will be in increasing order, * as will be the entries in subxids. Scan backwards to avoid O(N^2) @@ -3676,11 +3684,14 @@ XidCacheRemoveRunningXids(TransactionId xid, { TransactionId anxid = xids[i]; - for (j = MyPgXact->nxids - 1; j >= 0; j--) + for (j = MyProc->subxidStatus.count - 1; j >= 0; j--) { if (TransactionIdEquals(MyProc->subxids.xids[j], anxid)) { - XidCacheRemove(j); + MyProc->subxids.xids[j] = MyProc->subxids.xids[MyProc->subxidStatus.count - 1]; + pg_write_barrier(); + mysubxidstat->count--; + MyProc->subxidStatus.count--; break; } } @@ -3692,20 +3703,23 @@ XidCacheRemoveRunningXids(TransactionId xid, * error during AbortSubTransaction. 
So instead of Assert, emit a * debug warning. */ - if (j < 0 && !MyPgXact->overflowed) + if (j < 0 && !MyProc->subxidStatus.overflowed) elog(WARNING, "did not find subXID %u in MyProc", anxid); } - for (j = MyPgXact->nxids - 1; j >= 0; j--) + for (j = MyProc->subxidStatus.count - 1; j >= 0; j--) { if (TransactionIdEquals(MyProc->subxids.xids[j], xid)) { - XidCacheRemove(j); + MyProc->subxids.xids[j] = MyProc->subxids.xids[MyProc->subxidStatus.count - 1]; + pg_write_barrier(); + mysubxidstat->count--; + MyProc->subxidStatus.count--; break; } } /* Ordinarily we should have found it, unless the cache has overflowed */ - if (j < 0 && !MyPgXact->overflowed) + if (j < 0 && !MyProc->subxidStatus.overflowed) elog(WARNING, "did not find subXID %u in MyProc", xid); /* Also advance global latestCompletedXid while holding the lock */ diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index f6113b2d2432..aa9fbd80545b 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -63,9 +63,8 @@ int LockTimeout = 0; int IdleInTransactionSessionTimeout = 0; bool log_lock_waits = false; -/* Pointer to this process's PGPROC and PGXACT structs, if any */ +/* Pointer to this process's PGPROC struct, if any */ PGPROC *MyProc = NULL; -PGXACT *MyPgXact = NULL; /* * This spinlock protects the freelist of recycled PGPROC structures. 
@@ -110,10 +109,8 @@ ProcGlobalShmemSize(void) size = add_size(size, mul_size(TotalProcs, sizeof(PGPROC))); size = add_size(size, sizeof(slock_t)); - size = add_size(size, mul_size(MaxBackends, sizeof(PGXACT))); - size = add_size(size, mul_size(NUM_AUXILIARY_PROCS, sizeof(PGXACT))); - size = add_size(size, mul_size(max_prepared_xacts, sizeof(PGXACT))); size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->xids))); + size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->subxidStates))); size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->vacuumFlags))); return size; @@ -161,7 +158,6 @@ void InitProcGlobal(void) { PGPROC *procs; - PGXACT *pgxacts; int i, j; bool found; @@ -202,18 +198,6 @@ InitProcGlobal(void) /* XXX allProcCount isn't really all of them; it excludes prepared xacts */ ProcGlobal->allProcCount = MaxBackends + NUM_AUXILIARY_PROCS; - /* - * Also allocate a separate array of PGXACT structures. This is separate - * from the main PGPROC array so that the most heavily accessed data is - * stored contiguously in memory in as few cache lines as possible. This - * provides significant performance benefits, especially on a - * multiprocessor system. There is one PGXACT structure for every PGPROC - * structure. - */ - pgxacts = (PGXACT *) ShmemAlloc(TotalProcs * sizeof(PGXACT)); - MemSet(pgxacts, 0, TotalProcs * sizeof(PGXACT)); - ProcGlobal->allPgXact = pgxacts; - /* * Allocate arrays mirroring PGPROC fields in a dense manner. See * PROC_HDR. 
@@ -224,6 +208,8 @@ InitProcGlobal(void) ProcGlobal->xids = (TransactionId *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->xids)); MemSet(ProcGlobal->xids, 0, TotalProcs * sizeof(*ProcGlobal->xids)); + ProcGlobal->subxidStates = (XidCacheStatus *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->subxidStates)); + MemSet(ProcGlobal->subxidStates, 0, TotalProcs * sizeof(*ProcGlobal->subxidStates)); ProcGlobal->vacuumFlags = (uint8 *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->vacuumFlags)); MemSet(ProcGlobal->vacuumFlags, 0, TotalProcs * sizeof(*ProcGlobal->vacuumFlags)); @@ -372,7 +358,6 @@ InitProcess(void) (errcode(ERRCODE_TOO_MANY_CONNECTIONS), errmsg("sorry, too many clients already"))); } - MyPgXact = &ProcGlobal->allPgXact[MyProc->pgprocno]; /* * Cross-check that the PGPROC is of the type we expect; if this were not @@ -569,7 +554,6 @@ InitAuxiliaryProcess(void) ((volatile PGPROC *) auxproc)->pid = MyProcPid; MyProc = auxproc; - MyPgXact = &ProcGlobal->allPgXact[auxproc->pgprocno]; SpinLockRelease(ProcStructLock); diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index 9f3a8b518eb2..9c9a50ae457f 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -35,6 +35,14 @@ */ #define PGPROC_MAX_CACHED_SUBXIDS 64 /* XXX guessed-at value */ +typedef struct XidCacheStatus +{ + /* number of cached subxids, never more than PGPROC_MAX_CACHED_SUBXIDS */ + uint8 count; + /* has PGPROC->subxids overflowed */ + bool overflowed; +} XidCacheStatus; + struct XidCache { TransactionId xids[PGPROC_MAX_CACHED_SUBXIDS]; @@ -187,6 +195,8 @@ struct PGPROC */ SHM_QUEUE myProcLocks[NUM_LOCK_PARTITIONS]; + XidCacheStatus subxidStatus; /* mirrored with + * ProcGlobal->subxidStates[i] */ struct XidCache subxids; /* cache for subtransaction XIDs */ /* Support for group XID clearing. 
*/ @@ -235,22 +245,6 @@ struct PGPROC extern PGDLLIMPORT PGPROC *MyProc; -extern PGDLLIMPORT struct PGXACT *MyPgXact; - -/* - * Prior to PostgreSQL 9.2, the fields below were stored as part of the - * PGPROC. However, benchmarking revealed that packing these particular - * members into a separate array as tightly as possible sped up GetSnapshotData - * considerably on systems with many CPU cores, by reducing the number of - * cache lines needing to be fetched. Thus, think very carefully before adding - * anything else here. - */ -typedef struct PGXACT -{ - bool overflowed; - - uint8 nxids; -} PGXACT; /* * There is one ProcGlobal struct for the whole database cluster. @@ -310,12 +304,16 @@ typedef struct PROC_HDR { /* Array of PGPROC structures (not including dummies for prepared txns) */ PGPROC *allProcs; - /* Array of PGXACT structures (not including dummies for prepared txns) */ - PGXACT *allPgXact; /* Array mirroring PGPROC.xid for each PGPROC currently in the procarray */ TransactionId *xids; + /* + * Array mirroring PGPROC.subxidStatus for each PGPROC currently in the + * procarray. + */ + XidCacheStatus *subxidStates; + /* * Array mirroring PGPROC.vacuumFlags for each PGPROC currently in the * procarray. diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index b4948ac675f7..3d990463ce9c 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1536,7 +1536,6 @@ PGSetenvStatusType PGShmemHeader PGTransactionStatusType PGVerbosity -PGXACT PG_Locale_Strategy PG_Lock_Status PG_init_t From 1e7629d2c95ffd290ab0e18d7618ca9d9da94265 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Fri, 14 Aug 2020 22:14:03 -0400 Subject: [PATCH 300/334] Be more careful about the shape of hashable subplan clauses. 
nodeSubplan.c expects that the testexpr for a hashable ANY SubPlan has the form of one or more OpExprs whose LHS is an expression of the outer query's, while the RHS is an expression over Params representing output columns of the subquery. However, the planner only went as far as verifying that the clauses were all binary OpExprs. This works 99.99% of the time, because the clauses have the right shape when emitted by the parser --- but it's possible for function inlining to break that, as reported by PegoraroF10. To fix, teach the planner to check that the LHS and RHS contain the right things, or more accurately don't contain the wrong things. Given that this has been broken for years without anyone noticing, it seems sufficient to just give up hashing when it happens, rather than go to the trouble of commuting the clauses back again (which wouldn't necessarily work anyway). While poking at that, I also noticed that nodeSubplan.c had a baked-in assumption that the number of hash clauses is identical to the number of subquery output columns. Again, that's fine as far as parser output goes, but it's not hard to break it via function inlining. There seems little reason for that assumption though --- AFAICS, the only thing it's buying us is not having to store the number of hash clauses explicitly. Adding code to the planner to reject such cases would take more code than getting nodeSubplan.c to cope, so I fixed it that way. This has been broken for as long as we've had hashable SubPlans, so back-patch to all supported branches. 
Discussion: https://postgr.es/m/1549209182255-0.post@n3.nabble.com --- src/backend/executor/nodeSubplan.c | 16 ++--- src/backend/optimizer/plan/subselect.c | 77 ++++++++++++++++++------- src/backend/optimizer/util/clauses.c | 35 +++++++++++ src/include/nodes/execnodes.h | 2 + src/include/optimizer/clauses.h | 1 + src/test/regress/expected/subselect.out | 77 +++++++++++++++++++++++++ src/test/regress/sql/subselect.sql | 41 +++++++++++++ 7 files changed, 219 insertions(+), 30 deletions(-) diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c index 38c2fc0b50b6..9a7962518ee6 100644 --- a/src/backend/executor/nodeSubplan.c +++ b/src/backend/executor/nodeSubplan.c @@ -471,7 +471,7 @@ buildSubPlanHash(SubPlanState *node, ExprContext *econtext) { SubPlan *subplan = node->subplan; PlanState *planstate = node->planstate; - int ncols = list_length(subplan->paramIds); + int ncols = node->numCols; ExprContext *innerecontext = node->innerecontext; MemoryContext oldcontext; long nbuckets; @@ -878,11 +878,6 @@ ExecInitSubPlan(SubPlan *subplan, PlanState *parent) ALLOCSET_SMALL_SIZES); /* and a short-lived exprcontext for function evaluation */ sstate->innerecontext = CreateExprContext(estate); - /* Silly little array of column numbers 1..n */ - ncols = list_length(subplan->paramIds); - sstate->keyColIdx = (AttrNumber *) palloc(ncols * sizeof(AttrNumber)); - for (i = 0; i < ncols; i++) - sstate->keyColIdx[i] = i + 1; /* * We use ExecProject to evaluate the lefthand and righthand @@ -914,13 +909,15 @@ ExecInitSubPlan(SubPlan *subplan, PlanState *parent) (int) nodeTag(subplan->testexpr)); oplist = NIL; /* keep compiler quiet */ } - Assert(list_length(oplist) == ncols); + ncols = list_length(oplist); lefttlist = righttlist = NIL; + sstate->numCols = ncols; + sstate->keyColIdx = (AttrNumber *) palloc(ncols * sizeof(AttrNumber)); sstate->tab_eq_funcoids = (Oid *) palloc(ncols * sizeof(Oid)); + sstate->tab_collations = (Oid *) palloc(ncols * sizeof(Oid)); 
sstate->tab_hash_funcs = (FmgrInfo *) palloc(ncols * sizeof(FmgrInfo)); sstate->tab_eq_funcs = (FmgrInfo *) palloc(ncols * sizeof(FmgrInfo)); - sstate->tab_collations = (Oid *) palloc(ncols * sizeof(Oid)); sstate->lhs_hash_funcs = (FmgrInfo *) palloc(ncols * sizeof(FmgrInfo)); sstate->cur_eq_funcs = (FmgrInfo *) palloc(ncols * sizeof(FmgrInfo)); /* we'll need the cross-type equality fns below, but not in sstate */ @@ -979,6 +976,9 @@ ExecInitSubPlan(SubPlan *subplan, PlanState *parent) /* Set collation */ sstate->tab_collations[i - 1] = opexpr->inputcollid; + /* keyColIdx is just column numbers 1..n */ + sstate->keyColIdx[i - 1] = i; + i++; } diff --git a/src/backend/optimizer/plan/subselect.c b/src/backend/optimizer/plan/subselect.c index 9a8f738c9d05..6eb794669fe3 100644 --- a/src/backend/optimizer/plan/subselect.c +++ b/src/backend/optimizer/plan/subselect.c @@ -69,7 +69,7 @@ typedef struct inline_cte_walker_context static Node *build_subplan(PlannerInfo *root, Plan *plan, PlannerInfo *subroot, List *plan_params, SubLinkType subLinkType, int subLinkId, - Node *testexpr, bool adjust_testexpr, + Node *testexpr, List *testexpr_paramids, bool unknownEqFalse); static List *generate_subquery_params(PlannerInfo *root, List *tlist, List **paramIds); @@ -81,7 +81,8 @@ static Node *convert_testexpr(PlannerInfo *root, static Node *convert_testexpr_mutator(Node *node, convert_testexpr_context *context); static bool subplan_is_hashable(Plan *plan); -static bool testexpr_is_hashable(Node *testexpr); +static bool testexpr_is_hashable(Node *testexpr, List *param_ids); +static bool test_opexpr_is_hashable(OpExpr *testexpr, List *param_ids); static bool hash_ok_operator(OpExpr *expr); static bool contain_dml(Node *node); static bool contain_dml_walker(Node *node, void *context); @@ -237,7 +238,7 @@ make_subplan(PlannerInfo *root, Query *orig_subquery, /* And convert to SubPlan or InitPlan format. 
*/ result = build_subplan(root, plan, subroot, plan_params, subLinkType, subLinkId, - testexpr, true, isTopQual); + testexpr, NIL, isTopQual); /* * If it's a correlated EXISTS with an unimportant targetlist, we might be @@ -291,12 +292,11 @@ make_subplan(PlannerInfo *root, Query *orig_subquery, plan_params, ANY_SUBLINK, 0, newtestexpr, - false, true)); + paramIds, + true)); /* Check we got what we expected */ Assert(hashplan->parParam == NIL); Assert(hashplan->useHashTable); - /* build_subplan won't have filled in paramIds */ - hashplan->paramIds = paramIds; /* Leave it to the executor to decide which plan to use */ asplan = makeNode(AlternativeSubPlan); @@ -319,7 +319,7 @@ static Node * build_subplan(PlannerInfo *root, Plan *plan, PlannerInfo *subroot, List *plan_params, SubLinkType subLinkType, int subLinkId, - Node *testexpr, bool adjust_testexpr, + Node *testexpr, List *testexpr_paramids, bool unknownEqFalse) { Node *result; @@ -484,10 +484,10 @@ build_subplan(PlannerInfo *root, Plan *plan, PlannerInfo *subroot, else { /* - * Adjust the Params in the testexpr, unless caller said it's not - * needed. + * Adjust the Params in the testexpr, unless caller already took care + * of it (as indicated by passing a list of Param IDs). 
*/ - if (testexpr && adjust_testexpr) + if (testexpr && testexpr_paramids == NIL) { List *params; @@ -499,7 +499,10 @@ build_subplan(PlannerInfo *root, Plan *plan, PlannerInfo *subroot, params); } else + { splan->testexpr = testexpr; + splan->paramIds = testexpr_paramids; + } /* * We can't convert subplans of ALL_SUBLINK or ANY_SUBLINK types to @@ -511,7 +514,7 @@ build_subplan(PlannerInfo *root, Plan *plan, PlannerInfo *subroot, if (subLinkType == ANY_SUBLINK && splan->parParam == NIL && subplan_is_hashable(plan) && - testexpr_is_hashable(splan->testexpr)) + testexpr_is_hashable(splan->testexpr, splan->paramIds)) splan->useHashTable = true; /* @@ -734,24 +737,20 @@ subplan_is_hashable(Plan *plan) /* * testexpr_is_hashable: is an ANY SubLink's test expression hashable? + * + * To identify LHS vs RHS of the hash expression, we must be given the + * list of output Param IDs of the SubLink's subquery. */ static bool -testexpr_is_hashable(Node *testexpr) +testexpr_is_hashable(Node *testexpr, List *param_ids) { /* * The testexpr must be a single OpExpr, or an AND-clause containing only - * OpExprs. - * - * The combining operators must be hashable and strict. The need for - * hashability is obvious, since we want to use hashing. Without - * strictness, behavior in the presence of nulls is too unpredictable. We - * actually must assume even more than plain strictness: they can't yield - * NULL for non-null inputs, either (see nodeSubplan.c). However, hash - * indexes and hash joins assume that too. + * OpExprs, each of which satisfy test_opexpr_is_hashable(). 
*/ if (testexpr && IsA(testexpr, OpExpr)) { - if (hash_ok_operator((OpExpr *) testexpr)) + if (test_opexpr_is_hashable((OpExpr *) testexpr, param_ids)) return true; } else if (is_andclause(testexpr)) @@ -764,7 +763,7 @@ testexpr_is_hashable(Node *testexpr) if (!IsA(andarg, OpExpr)) return false; - if (!hash_ok_operator((OpExpr *) andarg)) + if (!test_opexpr_is_hashable((OpExpr *) andarg, param_ids)) return false; } return true; @@ -773,6 +772,40 @@ testexpr_is_hashable(Node *testexpr) return false; } +static bool +test_opexpr_is_hashable(OpExpr *testexpr, List *param_ids) +{ + /* + * The combining operator must be hashable and strict. The need for + * hashability is obvious, since we want to use hashing. Without + * strictness, behavior in the presence of nulls is too unpredictable. We + * actually must assume even more than plain strictness: it can't yield + * NULL for non-null inputs, either (see nodeSubplan.c). However, hash + * indexes and hash joins assume that too. + */ + if (!hash_ok_operator(testexpr)) + return false; + + /* + * The left and right inputs must belong to the outer and inner queries + * respectively; hence Params that will be supplied by the subquery must + * not appear in the LHS, and Vars of the outer query must not appear in + * the RHS. (Ordinarily, this must be true because of the way that the + * parser builds an ANY SubLink's testexpr ... but inlining of functions + * could have changed the expression's structure, so we have to check. + * Such cases do not occur often enough to be worth trying to optimize, so + * we don't worry about trying to commute the clause or anything like + * that; we just need to be sure not to build an invalid plan.) 
+ */ + if (list_length(testexpr->args) != 2) + return false; + if (contain_exec_param((Node *) linitial(testexpr->args), param_ids)) + return false; + if (contain_var_clause((Node *) lsecond(testexpr->args))) + return false; + return true; +} + /* * Check expression is hashable + strict * diff --git a/src/backend/optimizer/util/clauses.c b/src/backend/optimizer/util/clauses.c index e04b14407236..7105d0a2db9a 100644 --- a/src/backend/optimizer/util/clauses.c +++ b/src/backend/optimizer/util/clauses.c @@ -108,6 +108,7 @@ static bool contain_volatile_functions_not_nextval_walker(Node *node, void *cont static bool max_parallel_hazard_walker(Node *node, max_parallel_hazard_context *context); static bool contain_nonstrict_functions_walker(Node *node, void *context); +static bool contain_exec_param_walker(Node *node, List *param_ids); static bool contain_context_dependent_node(Node *clause); static bool contain_context_dependent_node_walker(Node *node, int *flags); static bool contain_leaked_vars_walker(Node *node, void *context); @@ -1221,6 +1222,40 @@ contain_nonstrict_functions_walker(Node *node, void *context) context); } +/***************************************************************************** + * Check clauses for Params + *****************************************************************************/ + +/* + * contain_exec_param + * Recursively search for PARAM_EXEC Params within a clause. + * + * Returns true if the clause contains any PARAM_EXEC Param with a paramid + * appearing in the given list of Param IDs. Does not descend into + * subqueries! 
+ */ +bool +contain_exec_param(Node *clause, List *param_ids) +{ + return contain_exec_param_walker(clause, param_ids); +} + +static bool +contain_exec_param_walker(Node *node, List *param_ids) +{ + if (node == NULL) + return false; + if (IsA(node, Param)) + { + Param *p = (Param *) node; + + if (p->paramkind == PARAM_EXEC && + list_member_int(param_ids, p->paramid)) + return true; + } + return expression_tree_walker(node, contain_exec_param_walker, param_ids); +} + /***************************************************************************** * Check clauses for context-dependent nodes *****************************************************************************/ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index cf832d7f9097..0b42dd6f9441 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -867,6 +867,8 @@ typedef struct SubPlanState MemoryContext hashtablecxt; /* memory context containing hash tables */ MemoryContext hashtempcxt; /* temp memory context for hash tables */ ExprContext *innerecontext; /* econtext for computing inner tuples */ + int numCols; /* number of columns being hashed */ + /* each of the remaining fields is an array of length numCols: */ AttrNumber *keyColIdx; /* control data for hash tables */ Oid *tab_eq_funcoids; /* equality func oids for table * datatype(s) */ diff --git a/src/include/optimizer/clauses.h b/src/include/optimizer/clauses.h index b7456e3e595b..7ef8cce79eec 100644 --- a/src/include/optimizer/clauses.h +++ b/src/include/optimizer/clauses.h @@ -38,6 +38,7 @@ extern bool contain_subplans(Node *clause); extern char max_parallel_hazard(Query *parse); extern bool is_parallel_safe(PlannerInfo *root, Node *node); extern bool contain_nonstrict_functions(Node *clause); +extern bool contain_exec_param(Node *clause, List *param_ids); extern bool contain_leaked_vars(Node *clause); extern Relids find_nonnullable_rels(Node *clause); diff --git 
a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out index 1c5d80da323e..b81923f2e741 100644 --- a/src/test/regress/expected/subselect.out +++ b/src/test/regress/expected/subselect.out @@ -757,6 +757,7 @@ insert into outer_text values ('a', null); insert into outer_text values ('b', null); create temp table inner_text (c1 text, c2 text); insert into inner_text values ('a', null); +insert into inner_text values ('123', '456'); select * from outer_text where (f1, f2) not in (select * from inner_text); f1 | f2 ----+---- @@ -797,6 +798,82 @@ select '1'::text in (select '1'::name union all select '1'::name); t (1 row) +-- +-- Test that we don't try to use a hashed subplan if the simplified +-- testexpr isn't of the right shape +-- +-- this fails by default, of course +select * from int8_tbl where q1 in (select c1 from inner_text); +ERROR: operator does not exist: bigint = text +LINE 1: select * from int8_tbl where q1 in (select c1 from inner_tex... + ^ +HINT: No operator matches the given name and argument types. You might need to add explicit type casts. 
+begin; +-- make an operator to allow it to succeed +create function bogus_int8_text_eq(int8, text) returns boolean +language sql as 'select $1::text = $2'; +create operator = (procedure=bogus_int8_text_eq, leftarg=int8, rightarg=text); +explain (costs off) +select * from int8_tbl where q1 in (select c1 from inner_text); + QUERY PLAN +-------------------------------- + Seq Scan on int8_tbl + Filter: (hashed SubPlan 1) + SubPlan 1 + -> Seq Scan on inner_text +(4 rows) + +select * from int8_tbl where q1 in (select c1 from inner_text); + q1 | q2 +-----+------------------ + 123 | 456 + 123 | 4567890123456789 +(2 rows) + +-- inlining of this function results in unusual number of hash clauses, +-- which we can still cope with +create or replace function bogus_int8_text_eq(int8, text) returns boolean +language sql as 'select $1::text = $2 and $1::text = $2'; +explain (costs off) +select * from int8_tbl where q1 in (select c1 from inner_text); + QUERY PLAN +-------------------------------- + Seq Scan on int8_tbl + Filter: (hashed SubPlan 1) + SubPlan 1 + -> Seq Scan on inner_text +(4 rows) + +select * from int8_tbl where q1 in (select c1 from inner_text); + q1 | q2 +-----+------------------ + 123 | 456 + 123 | 4567890123456789 +(2 rows) + +-- inlining of this function causes LHS and RHS to be switched, +-- which we can't cope with, so hashing should be abandoned +create or replace function bogus_int8_text_eq(int8, text) returns boolean +language sql as 'select $2 = $1::text'; +explain (costs off) +select * from int8_tbl where q1 in (select c1 from inner_text); + QUERY PLAN +-------------------------------------- + Seq Scan on int8_tbl + Filter: (SubPlan 1) + SubPlan 1 + -> Materialize + -> Seq Scan on inner_text +(5 rows) + +select * from int8_tbl where q1 in (select c1 from inner_text); + q1 | q2 +-----+------------------ + 123 | 456 + 123 | 4567890123456789 +(2 rows) + +rollback; -- to get rid of the bogus operator -- -- Test case for planner bug with nested EXISTS 
handling -- diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql index a56057bd4fad..cce8ebdb3d9f 100644 --- a/src/test/regress/sql/subselect.sql +++ b/src/test/regress/sql/subselect.sql @@ -449,6 +449,7 @@ insert into outer_text values ('b', null); create temp table inner_text (c1 text, c2 text); insert into inner_text values ('a', null); +insert into inner_text values ('123', '456'); select * from outer_text where (f1, f2) not in (select * from inner_text); @@ -468,6 +469,46 @@ select 'foo'::text in (select 'bar'::name union all select 'bar'::name); select '1'::text in (select '1'::name union all select '1'::name); +-- +-- Test that we don't try to use a hashed subplan if the simplified +-- testexpr isn't of the right shape +-- + +-- this fails by default, of course +select * from int8_tbl where q1 in (select c1 from inner_text); + +begin; + +-- make an operator to allow it to succeed +create function bogus_int8_text_eq(int8, text) returns boolean +language sql as 'select $1::text = $2'; + +create operator = (procedure=bogus_int8_text_eq, leftarg=int8, rightarg=text); + +explain (costs off) +select * from int8_tbl where q1 in (select c1 from inner_text); +select * from int8_tbl where q1 in (select c1 from inner_text); + +-- inlining of this function results in unusual number of hash clauses, +-- which we can still cope with +create or replace function bogus_int8_text_eq(int8, text) returns boolean +language sql as 'select $1::text = $2 and $1::text = $2'; + +explain (costs off) +select * from int8_tbl where q1 in (select c1 from inner_text); +select * from int8_tbl where q1 in (select c1 from inner_text); + +-- inlining of this function causes LHS and RHS to be switched, +-- which we can't cope with, so hashing should be abandoned +create or replace function bogus_int8_text_eq(int8, text) returns boolean +language sql as 'select $2 = $1::text'; + +explain (costs off) +select * from int8_tbl where q1 in (select c1 from inner_text); 
+select * from int8_tbl where q1 in (select c1 from inner_text); + +rollback; -- to get rid of the bogus operator + -- -- Test case for planner bug with nested EXISTS handling -- From b48cac3b10a02fea2bed684469dd4d36a6616405 Mon Sep 17 00:00:00 2001 From: Amit Kapila Date: Sat, 15 Aug 2020 08:34:48 +0530 Subject: [PATCH 301/334] Mark a few logical decoding related variables with PGDLLIMPORT. Commit 7259736a6e added two variables CheckXidAlive and bsysscan to detect concurrent aborts and used these in inline functions that are part of the API that can be used by extensions. So it is better to mark them with PGDLLIMPORT. Reported-by: Thomas Munro Discussion: https://postgr.es/m/CA+hUKGJ7+HYupd=Pz9+QrXa-C_YnbC4rAYu8V+=OKg=UgdzMeg@mail.gmail.com --- src/include/access/xact.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/include/access/xact.h b/src/include/access/xact.h index c18554bae2c2..c59de9bebaf8 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -82,8 +82,8 @@ typedef enum extern int synchronous_commit; /* used during logical streaming of a transaction */ -extern TransactionId CheckXidAlive; -extern bool bsysscan; +extern PGDLLIMPORT TransactionId CheckXidAlive; +extern PGDLLIMPORT bool bsysscan; /* * Miscellaneous flag bits to record events which occur on the top level From bacda6a327efb820d0e9f3262b81e803b2d5702b Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Sat, 15 Aug 2020 11:23:18 +0200 Subject: [PATCH 302/334] Remove obsolete HAVE_BUGGY_SOLARIS_STRTOD Fixed more than 10 years ago. 
Reviewed-by: Noah Misch Discussion: https://www.postgresql.org/message-id/flat/aa266ede-baaa-f4e6-06cf-5b1737610e9a%402ndquadrant.com --- src/backend/utils/adt/float.c | 24 ------------------------ src/include/port/solaris.h | 12 ------------ 2 files changed, 36 deletions(-) diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c index ffd1ce8c7610..429c9280c0cf 100644 --- a/src/backend/utils/adt/float.c +++ b/src/backend/utils/adt/float.c @@ -271,18 +271,6 @@ float4in(PG_FUNCTION_ARGS) errmsg("invalid input syntax for type %s: \"%s\"", "real", orig_num))); } -#ifdef HAVE_BUGGY_SOLARIS_STRTOD - else - { - /* - * Many versions of Solaris have a bug wherein strtod sets endptr to - * point one byte beyond the end of the string when given "inf" or - * "infinity". - */ - if (endptr != num && endptr[-1] == '\0') - endptr--; - } -#endif /* HAVE_BUGGY_SOLARIS_STRTOD */ /* skip trailing whitespace */ while (*endptr != '\0' && isspace((unsigned char) *endptr)) @@ -499,18 +487,6 @@ float8in_internal_opt_error(char *num, char **endptr_p, type_name, orig_string))), have_error); } -#ifdef HAVE_BUGGY_SOLARIS_STRTOD - else - { - /* - * Many versions of Solaris have a bug wherein strtod sets endptr to - * point one byte beyond the end of the string when given "inf" or - * "infinity". - */ - if (endptr != num && endptr[-1] == '\0') - endptr--; - } -#endif /* HAVE_BUGGY_SOLARIS_STRTOD */ /* skip trailing whitespace */ while (*endptr != '\0' && isspace((unsigned char) *endptr)) diff --git a/src/include/port/solaris.h b/src/include/port/solaris.h index eeb1a320bd5b..e63a3bd824d6 100644 --- a/src/include/port/solaris.h +++ b/src/include/port/solaris.h @@ -24,15 +24,3 @@ #if defined(__i386__) #include #endif - -/* - * Many versions of Solaris have broken strtod() --- see bug #4751182. 
- * This has been fixed in current versions of Solaris: - * - * http://sunsolve.sun.com/search/document.do?assetkey=1-21-108993-62-1&searchclause=108993-62 - * http://sunsolve.sun.com/search/document.do?assetkey=1-21-112874-34-1&searchclause=112874-34 - * - * However, many people might not have patched versions, so - * still use our own fix for the buggy version. - */ -#define HAVE_BUGGY_SOLARIS_STRTOD From 53095b5fe650270118bc2ab77416d08e19472cd3 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Sat, 15 Aug 2020 11:23:18 +0200 Subject: [PATCH 303/334] Remove obsolete cygwin.h hack The version being checked for is 20 years old. Reviewed-by: Marco Atzeri Discussion: https://www.postgresql.org/message-id/flat/aa266ede-baaa-f4e6-06cf-5b1737610e9a%402ndquadrant.com --- src/include/port/cygwin.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/include/port/cygwin.h b/src/include/port/cygwin.h index f1fc1a93d76c..64d69936e5e0 100644 --- a/src/include/port/cygwin.h +++ b/src/include/port/cygwin.h @@ -1,14 +1,5 @@ /* src/include/port/cygwin.h */ -#include - -/* - * Check for b20.1 and disable AF_UNIX family socket support. - */ -#if CYGWIN_VERSION_DLL_MAJOR < 1001 -#undef HAVE_UNIX_SOCKETS -#endif - #ifdef BUILDING_DLL #define PGDLLIMPORT __declspec (dllexport) #else From d4d443b3bbbb3eb9cdc511564ef3c57fde7dd3ac Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 15 Aug 2020 12:04:19 -0400 Subject: [PATCH 304/334] Remove no-longer-usable hstore--1.0--1.1.sql update script. Since commit 865f14a2d made "=>" unusable as an operator name, it's been impossible either to install hstore 1.0 or to execute this update script. There's not much point in continuing to ship it. 
Discussion: https://postgr.es/m/653936.1597431032@sss.pgh.pa.us --- contrib/hstore/Makefile | 2 +- contrib/hstore/hstore--1.0--1.1.sql | 7 ------- 2 files changed, 1 insertion(+), 8 deletions(-) delete mode 100644 contrib/hstore/hstore--1.0--1.1.sql diff --git a/contrib/hstore/Makefile b/contrib/hstore/Makefile index 872ca03cd1fb..72376d900763 100644 --- a/contrib/hstore/Makefile +++ b/contrib/hstore/Makefile @@ -15,7 +15,7 @@ DATA = hstore--1.4.sql \ hstore--1.5--1.6.sql \ hstore--1.4--1.5.sql \ hstore--1.3--1.4.sql hstore--1.2--1.3.sql \ - hstore--1.1--1.2.sql hstore--1.0--1.1.sql + hstore--1.1--1.2.sql PGFILEDESC = "hstore - key/value pair data type" HEADERS = hstore.h diff --git a/contrib/hstore/hstore--1.0--1.1.sql b/contrib/hstore/hstore--1.0--1.1.sql deleted file mode 100644 index 4e32a575c5f6..000000000000 --- a/contrib/hstore/hstore--1.0--1.1.sql +++ /dev/null @@ -1,7 +0,0 @@ -/* contrib/hstore/hstore--1.0--1.1.sql */ - --- complain if script is sourced in psql, rather than via ALTER EXTENSION -\echo Use "ALTER EXTENSION hstore UPDATE TO '1.1'" to load this file. \quit - -ALTER EXTENSION hstore DROP OPERATOR => (text, text); -DROP OPERATOR => (text, text); From 566372b3d6435639e4cc4476d79b8505a0297c87 Mon Sep 17 00:00:00 2001 From: Noah Misch Date: Sat, 15 Aug 2020 10:15:53 -0700 Subject: [PATCH 305/334] Prevent concurrent SimpleLruTruncate() for any given SLRU. The SimpleLruTruncate() header comment states the new coding rule. To achieve this, add locktype "frozenid" and two LWLocks. This closes a rare opportunity for data loss, which manifested as "apparent wraparound" or "could not access status of transaction" errors. Data loss is more likely in pg_multixact, due to released branches' thin margin between multiStopLimit and multiWrapLimit. If a user's physical replication primary logged ": apparent wraparound" messages, the user should rebuild standbys of that primary regardless of symptoms. 
At less risk is a cluster having emitted "not accepting commands" errors or "must be vacuumed" warnings at some point. One can test a cluster for this data loss by running VACUUM FREEZE in every database. Back-patch to 9.5 (all supported versions). Discussion: https://postgr.es/m/20190218073103.GA1434723@rfd.leadboat.com --- doc/src/sgml/catalogs.sgml | 4 ++- doc/src/sgml/monitoring.sgml | 16 ++++++++++ src/backend/access/transam/slru.c | 8 +++++ src/backend/access/transam/subtrans.c | 4 +-- src/backend/commands/async.c | 37 +++++++++++++++++------- src/backend/commands/vacuum.c | 13 +++++++++ src/backend/storage/lmgr/lmgr.c | 20 +++++++++++++ src/backend/storage/lmgr/lwlocknames.txt | 3 ++ src/backend/utils/adt/lockfuncs.c | 12 ++++++++ src/include/storage/lmgr.h | 3 ++ src/include/storage/lock.h | 10 +++++++ 11 files changed, 117 insertions(+), 13 deletions(-) diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index 26fda20d1939..fc329c5cff96 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -10226,7 +10226,8 @@ SCRAM-SHA-256$<iteration count>:&l and general database objects (identified by class OID and object OID, in the same way as in pg_description or pg_depend). Also, the right to extend a - relation is represented as a separate lockable object. + relation is represented as a separate lockable object, as is the right to + update pg_database.datfrozenxid. Also, advisory locks can be taken on numbers that have user-defined meanings. @@ -10254,6 +10255,7 @@ SCRAM-SHA-256$<iteration count>:&l Type of the lockable object: relation, extend, + frozenid, page, tuple, transactionid, diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 7dcddf478a11..304c49f07b76 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -1742,6 +1742,12 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser extend Waiting to extend a relation. 
+ + frozenid + Waiting to + update pg_database.datfrozenxid + and pg_database.datminmxid. + object Waiting to acquire a lock on a non-relation database object. @@ -1910,6 +1916,11 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser NotifyQueue Waiting to read or update NOTIFY messages. + + NotifyQueueTail + Waiting to update limit on NOTIFY message + storage. + NotifySLRU Waiting to access the NOTIFY message SLRU @@ -2086,6 +2097,11 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser WALWrite Waiting for WAL buffers to be written to disk. + + WrapLimitsVacuum + Waiting to update limits on transaction id and multixact + consumption. + XactBuffer Waiting for I/O on a transaction status SLRU buffer. diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index d1dbb43e096c..7640f153c227 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -1191,6 +1191,14 @@ SimpleLruFlush(SlruCtl ctl, bool allow_redirtied) /* * Remove all segments before the one holding the passed page number + * + * All SLRUs prevent concurrent calls to this function, either with an LWLock + * or by calling it only as part of a checkpoint. Mutual exclusion must begin + * before computing cutoffPage. Mutual exclusion must end after any limit + * update that would permit other backends to write fresh data into the + * segment immediately preceding the one containing cutoffPage. Otherwise, + * when the SLRU is quite full, SimpleLruTruncate() might delete that segment + * after it has accrued freshly-written data. 
*/ void SimpleLruTruncate(SlruCtl ctl, int cutoffPage) diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c index a087a5554210..a50f60b99af2 100644 --- a/src/backend/access/transam/subtrans.c +++ b/src/backend/access/transam/subtrans.c @@ -349,8 +349,8 @@ ExtendSUBTRANS(TransactionId newestXact) /* * Remove all SUBTRANS segments before the one holding the passed transaction ID * - * This is normally called during checkpoint, with oldestXact being the - * oldest TransactionXmin of any running transaction. + * oldestXact is the oldest TransactionXmin of any running transaction. This + * is called only during checkpoint. */ void TruncateSUBTRANS(TransactionId oldestXact) diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c index 71b7577afc06..4c1286eb988e 100644 --- a/src/backend/commands/async.c +++ b/src/backend/commands/async.c @@ -244,19 +244,22 @@ typedef struct QueueBackendStatus /* * Shared memory state for LISTEN/NOTIFY (excluding its SLRU stuff) * - * The AsyncQueueControl structure is protected by the NotifyQueueLock. + * The AsyncQueueControl structure is protected by the NotifyQueueLock and + * NotifyQueueTailLock. * - * When holding the lock in SHARED mode, backends may only inspect their own - * entries as well as the head and tail pointers. Consequently we can allow a - * backend to update its own record while holding only SHARED lock (since no - * other backend will inspect it). + * When holding NotifyQueueLock in SHARED mode, backends may only inspect + * their own entries as well as the head and tail pointers. Consequently we + * can allow a backend to update its own record while holding only SHARED lock + * (since no other backend will inspect it). * - * When holding the lock in EXCLUSIVE mode, backends can inspect the entries - * of other backends and also change the head and tail pointers. 
+ * When holding NotifyQueueLock in EXCLUSIVE mode, backends can inspect the + * entries of other backends and also change the head pointer. When holding + * both NotifyQueueLock and NotifyQueueTailLock in EXCLUSIVE mode, backends + * can change the tail pointer. * * NotifySLRULock is used as the control lock for the pg_notify SLRU buffers. - * In order to avoid deadlocks, whenever we need both locks, we always first - * get NotifyQueueLock and then NotifySLRULock. + * In order to avoid deadlocks, whenever we need multiple locks, we first get + * NotifyQueueTailLock, then NotifyQueueLock, and lastly NotifySLRULock. * * Each backend uses the backend[] array entry with index equal to its * BackendId (which can range from 1 to MaxBackends). We rely on this to make @@ -2177,6 +2180,10 @@ asyncQueueAdvanceTail(void) int newtailpage; int boundary; + /* Restrict task to one backend per cluster; see SimpleLruTruncate(). */ + LWLockAcquire(NotifyQueueTailLock, LW_EXCLUSIVE); + + /* Compute the new tail. */ LWLockAcquire(NotifyQueueLock, LW_EXCLUSIVE); min = QUEUE_HEAD; for (BackendId i = QUEUE_FIRST_LISTENER; i > 0; i = QUEUE_NEXT_LISTENER(i)) @@ -2185,7 +2192,6 @@ asyncQueueAdvanceTail(void) min = QUEUE_POS_MIN(min, QUEUE_BACKEND_POS(i)); } oldtailpage = QUEUE_POS_PAGE(QUEUE_TAIL); - QUEUE_TAIL = min; LWLockRelease(NotifyQueueLock); /* @@ -2205,6 +2211,17 @@ asyncQueueAdvanceTail(void) */ SimpleLruTruncate(NotifyCtl, newtailpage); } + + /* + * Advertise the new tail. This changes asyncQueueIsFull()'s verdict for + * the segment immediately prior to the new tail, allowing fresh data into + * that segment. 
+ */ + LWLockAcquire(NotifyQueueLock, LW_EXCLUSIVE); + QUEUE_TAIL = min; + LWLockRelease(NotifyQueueLock); + + LWLockRelease(NotifyQueueTailLock); } /* diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index aba13c31d1bc..5189a5ad5e37 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -1361,6 +1361,14 @@ vac_update_datfrozenxid(void) bool bogus = false; bool dirty = false; + /* + * Restrict this task to one backend per database. This avoids race + * conditions that would move datfrozenxid or datminmxid backward. It + * avoids calling vac_truncate_clog() with a datfrozenxid preceding a + * datfrozenxid passed to an earlier vac_truncate_clog() call. + */ + LockDatabaseFrozenIds(ExclusiveLock); + /* * Initialize the "min" calculation with * GetOldestNonRemovableTransactionId(), which is a reasonable @@ -1551,6 +1559,9 @@ vac_truncate_clog(TransactionId frozenXID, bool bogus = false; bool frozenAlreadyWrapped = false; + /* Restrict task to one backend per cluster; see SimpleLruTruncate(). */ + LWLockAcquire(WrapLimitsVacuumLock, LW_EXCLUSIVE); + /* init oldest datoids to sync with my frozenXID/minMulti values */ oldestxid_datoid = MyDatabaseId; minmulti_datoid = MyDatabaseId; @@ -1660,6 +1671,8 @@ vac_truncate_clog(TransactionId frozenXID, */ SetTransactionIdLimit(frozenXID, oldestxid_datoid); SetMultiXactIdLimit(minMulti, minmulti_datoid, false); + + LWLockRelease(WrapLimitsVacuumLock); } diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c index 20103200952e..7409de940592 100644 --- a/src/backend/storage/lmgr/lmgr.c +++ b/src/backend/storage/lmgr/lmgr.c @@ -460,6 +460,21 @@ UnlockRelationForExtension(Relation relation, LOCKMODE lockmode) LockRelease(&tag, lockmode, false); } +/* + * LockDatabaseFrozenIds + * + * This allows one backend per database to execute vac_update_datfrozenxid(). 
+ */ +void +LockDatabaseFrozenIds(LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_DATABASE_FROZEN_IDS(tag, MyDatabaseId); + + (void) LockAcquire(&tag, lockmode, false, false); +} + /* * LockPage * @@ -1098,6 +1113,11 @@ DescribeLockTag(StringInfo buf, const LOCKTAG *tag) tag->locktag_field2, tag->locktag_field1); break; + case LOCKTAG_DATABASE_FROZEN_IDS: + appendStringInfo(buf, + _("pg_database.datfrozenxid of database %u"), + tag->locktag_field1); + break; case LOCKTAG_PAGE: appendStringInfo(buf, _("page %u of relation %u of database %u"), diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index e6985e8eedfb..774292fd9427 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -50,3 +50,6 @@ MultiXactTruncationLock 41 OldSnapshotTimeMapLock 42 LogicalRepWorkerLock 43 XactTruncationLock 44 +# 45 was XactTruncationLock until removal of BackendRandomLock +WrapLimitsVacuumLock 46 +NotifyQueueTailLock 47 diff --git a/src/backend/utils/adt/lockfuncs.c b/src/backend/utils/adt/lockfuncs.c index e992d1bbfced..f592292d067b 100644 --- a/src/backend/utils/adt/lockfuncs.c +++ b/src/backend/utils/adt/lockfuncs.c @@ -29,6 +29,7 @@ const char *const LockTagTypeNames[] = { "relation", "extend", + "frozenid", "page", "tuple", "transactionid", @@ -254,6 +255,17 @@ pg_lock_status(PG_FUNCTION_ARGS) nulls[8] = true; nulls[9] = true; break; + case LOCKTAG_DATABASE_FROZEN_IDS: + values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1); + nulls[2] = true; + nulls[3] = true; + nulls[4] = true; + nulls[5] = true; + nulls[6] = true; + nulls[7] = true; + nulls[8] = true; + nulls[9] = true; + break; case LOCKTAG_PAGE: values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1); values[2] = ObjectIdGetDatum(instance->locktag.locktag_field2); diff --git a/src/include/storage/lmgr.h b/src/include/storage/lmgr.h index 3acc11aa5a3b..f7cabcbbf550 100644 --- a/src/include/storage/lmgr.h +++ 
b/src/include/storage/lmgr.h @@ -59,6 +59,9 @@ extern bool ConditionalLockRelationForExtension(Relation relation, LOCKMODE lockmode); extern int RelationExtensionLockWaiterCount(Relation relation); +/* Lock to recompute pg_database.datfrozenxid in the current database */ +extern void LockDatabaseFrozenIds(LOCKMODE lockmode); + /* Lock a page (currently only used within indexes) */ extern void LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode); extern bool ConditionalLockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode); diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h index fdabf427210a..1c3e9c1999f5 100644 --- a/src/include/storage/lock.h +++ b/src/include/storage/lock.h @@ -138,6 +138,7 @@ typedef enum LockTagType { LOCKTAG_RELATION, /* whole relation */ LOCKTAG_RELATION_EXTEND, /* the right to extend a relation */ + LOCKTAG_DATABASE_FROZEN_IDS, /* pg_database.datfrozenxid */ LOCKTAG_PAGE, /* one page of a relation */ LOCKTAG_TUPLE, /* one physical tuple */ LOCKTAG_TRANSACTION, /* transaction (for waiting for xact done) */ @@ -194,6 +195,15 @@ typedef struct LOCKTAG (locktag).locktag_type = LOCKTAG_RELATION_EXTEND, \ (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) +/* ID info for frozen IDs is DB OID */ +#define SET_LOCKTAG_DATABASE_FROZEN_IDS(locktag,dboid) \ + ((locktag).locktag_field1 = (dboid), \ + (locktag).locktag_field2 = 0, \ + (locktag).locktag_field3 = 0, \ + (locktag).locktag_field4 = 0, \ + (locktag).locktag_type = LOCKTAG_DATABASE_FROZEN_IDS, \ + (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) + /* ID info for a page is RELATION info + BlockNumber */ #define SET_LOCKTAG_PAGE(locktag,dboid,reloid,blocknum) \ ((locktag).locktag_field1 = (dboid), \ From db659a3416b967d716806e558efbb9d1ec610cd1 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sat, 15 Aug 2020 15:43:34 -0400 Subject: [PATCH 306/334] Doc: various improvements for pg_basebackup reference page. 
Put the -r option in the right section (it certainly isn't an option controlling "the location and format of the output"). Clarify the behavior of the tablespace and waldir options (that part per gripe from robert@interactive.co.uk). Make a large number of small copy-editing fixes in text that visibly wasn't written by native speakers, and try to avoid grammatical inconsistencies between the descriptions of the different options. Back-patch to v13, since HEAD hasn't meaningfully diverged yet. Discussion: https://postgr.es/m/159749418850.14322.216503677134569752@wrigleys.postgresql.org --- doc/src/sgml/ref/pg_basebackup.sgml | 324 +++++++++++++++------------- 1 file changed, 171 insertions(+), 153 deletions(-) diff --git a/doc/src/sgml/ref/pg_basebackup.sgml b/doc/src/sgml/ref/pg_basebackup.sgml index e246efbdb520..aa0b27c9f300 100644 --- a/doc/src/sgml/ref/pg_basebackup.sgml +++ b/doc/src/sgml/ref/pg_basebackup.sgml @@ -29,51 +29,51 @@ PostgreSQL documentation Description - pg_basebackup is used to take base backups of - a running PostgreSQL database cluster. These - are taken without affecting other clients to the database, and can be used + pg_basebackup is used to take a base backup of + a running PostgreSQL database cluster. The backup + is taken without affecting other clients of the database, and can be used both for point-in-time recovery (see ) - and as the starting point for a log shipping or streaming replication standby - servers (see ). + and as the starting point for a log-shipping or streaming-replication standby + server (see ). - pg_basebackup makes a binary copy of the database - cluster files, while making sure the system is put in and + pg_basebackup makes an exact copy of the database + cluster's files, while making sure the server is put into and out of backup mode automatically. Backups are always taken of the entire database cluster; it is not possible to back up individual databases or - database objects. 
For individual database backups, a tool such as + database objects. For selective backups, another tool such as must be used. The backup is made over a regular PostgreSQL - connection, and uses the replication protocol. The connection must be made - with a user having REPLICATION permissions - (see ) or a superuser, - and pg_hba.conf must explicitly permit the replication - connection. The server must also be configured - with set high enough to leave at least - one session available for the backup and one for WAL streaming (if used). + connection that uses the replication protocol. The connection must be made + with a user ID that has REPLICATION permissions + (see ) or is a superuser, + and pg_hba.conf + must permit the replication connection. The server must also be configured + with set high enough to provide at + least one walsender for the backup plus one for WAL streaming (if used). - There can be multiple pg_basebackups running at the same time, but it is + There can be multiple pg_basebackups running at the same time, but it is usually better from a performance point of view to take only one backup, and copy the result. pg_basebackup can make a base backup from - not only the primary but also the standby. To take a backup from the standby, + not only a primary server but also a standby. To take a backup from a standby, set up the standby so that it can accept replication connections (that is, set max_wal_senders and , - and configure host-based authentication). + and configure its pg_hba.conf appropriately). You will also need to enable on the primary. - Note that there are some limitations in an online backup from the standby: + Note that there are some limitations in taking a backup from a standby: @@ -89,7 +89,7 @@ PostgreSQL documentation - If the standby is promoted to the primary during online backup, the backup fails. + If the standby is promoted to be primary during backup, the backup fails. 
@@ -105,7 +105,7 @@ PostgreSQL documentation Whenever pg_basebackup is taking a base - backup, the pg_stat_progress_basebackup + backup, the server's pg_stat_progress_basebackup view will report the progress of the backup. See for details. @@ -116,7 +116,7 @@ PostgreSQL documentation The following command-line options control the location and format of the - output. + output: @@ -124,15 +124,15 @@ PostgreSQL documentation - Directory to write the output to. - pg_basebackup will create the directory and - any parent directories if necessary. The directory may already exist, - but it is an error if the directory already exists and is not empty. + Sets the target directory to write the output to. + pg_basebackup will create this directory + (and any missing parent directories) if it does not exist. If it + already exists, it must be empty. - When the backup is in tar mode, and the directory is specified as - - (dash), the tar file will be written to - stdout. + When the backup is in tar format, the target directory may be + specified as - (dash), causing the tar file to be + written to stdout. This option is required. @@ -155,12 +155,12 @@ PostgreSQL documentation Write the output as plain files, with the same layout as the - current data directory and tablespaces. When the cluster has + source server's data directory and tablespaces. When the cluster has no additional tablespaces, the whole database will be placed in the target directory. If the cluster contains additional tablespaces, the main data directory will be placed in the target directory, but all other tablespaces will be placed - in the same absolute path as they have on the server. + in the same absolute path as they have on the source server. This is the default format. @@ -174,15 +174,15 @@ PostgreSQL documentation Write the output as tar files in the target directory. 
The main - data directory will be written to a file named - base.tar, and all other tablespaces will - be named after the tablespace OID. - + data directory's contents will be written to a file named + base.tar, and each other tablespace will be + written to a separate tar file named after that tablespace's OID. + - If the value - (dash) is specified as - target directory, the tar contents will be written to - standard output, suitable for piping to for example - gzip. This is only possible if + If the target directory is specified as - + (dash), the tar contents will be written to + standard output, suitable for piping to (for example) + gzip. This is only allowed if the cluster has no additional tablespaces and WAL streaming is not used. @@ -192,40 +192,22 @@ PostgreSQL documentation - - - - - - The maximum transfer rate of data transferred from the server. Values are - in kilobytes per second. Use a suffix of M to indicate megabytes - per second. A suffix of k is also accepted, and has no effect. - Valid values are between 32 kilobytes per second and 1024 megabytes per second. - - - The purpose is to limit the impact of pg_basebackup - on the running server. - - - This option always affects transfer of the data directory. Transfer of - WAL files is only affected if the collection method is fetch. - - - - - Create standby.signal and append connection settings - to postgresql.auto.conf in the output - directory (or into the base archive file when using tar format) to - ease setting up a standby server. + Creates a standby.signal file and appends + connection settings to the postgresql.auto.conf + file in the target directory (or within the base archive file when + using tar format). This eases setting up a standby server using the + results of the backup. 
+ + The postgresql.auto.conf file will record the connection settings and, if specified, the replication slot - that pg_basebackup is using, so that the + that pg_basebackup is using, so that streaming replication will use the same settings later on. @@ -237,17 +219,21 @@ PostgreSQL documentation - Relocate the tablespace in directory olddir + Relocates the tablespace in directory olddir to newdir during the backup. To be effective, olddir must exactly match the - path specification of the tablespace as it is currently defined. (But - it is not an error if there is no tablespace - in olddir contained in the backup.) + path specification of the tablespace as it is defined on the source + server. (But it is not an error if there is no tablespace + in olddir on the source server.) + Meanwhile newdir is a directory in the + receiving host's filesystem. As with the main target directory, + newdir need not exist already, but if + it does exist it must be empty. Both olddir - and newdir must be absolute paths. If a - path happens to contain a = sign, escape it with a - backslash. This option can be specified multiple times for multiple - tablespaces. See examples below. + and newdir must be absolute paths. If + either path needs to contain an equal sign (=), + precede that with a backslash. This option can be specified multiple + times for multiple tablespaces. @@ -263,10 +249,16 @@ PostgreSQL documentation - Specifies the location for the write-ahead log directory. + Sets the directory to write WAL (write-ahead log) files to. + By default WAL files will be placed in + the pg_wal subdirectory of the target + directory, but this option can be used to place them elsewhere. waldir must be an absolute path. - The write-ahead log directory can only be specified when - the backup is in plain mode. + As with the main target directory, + waldir need not exist already, but if + it does exist it must be empty. 
+ This option can only be specified when + the backup is in plain format. @@ -276,16 +268,16 @@ PostgreSQL documentation - Includes the required write-ahead log files (WAL files) in the + Includes the required WAL (write-ahead log) files in the backup. This will include all write-ahead logs generated during the backup. Unless the method none is specified, - it is possible to start a postmaster directly in the extracted + it is possible to start a postmaster in the target directory without the need to consult the log archive, thus - making this a completely standalone backup. + making the output a completely standalone backup. - The following methods for collecting the write-ahead logs are - supported: + The following methods for collecting the + write-ahead logs are supported: @@ -293,7 +285,7 @@ PostgreSQL documentation none - Don't include write-ahead log in the backup. + Don't include write-ahead logs in the backup. @@ -304,15 +296,16 @@ PostgreSQL documentation The write-ahead log files are collected at the end of the backup. - Therefore, it is necessary for the + Therefore, it is necessary for the source server's parameter to be set high - enough that the log is not removed before the end of the backup. - If the log has been rotated when it's time to transfer it, the - backup will fail and be unusable. + enough that the required log data is not removed before the end + of the backup. If the required log data has been recycled + before it's time to transfer it, the backup will fail and be + unusable. - When tar format mode is used, the write-ahead log files will be - written to the base.tar file. + When tar format is used, the write-ahead log files will be + included in the base.tar file. @@ -322,16 +315,16 @@ PostgreSQL documentation stream - Stream the write-ahead log while the backup is created. This will - open a second connection to the server and start streaming the - write-ahead log in parallel while running the backup. 
Therefore, - it will use up two connections configured by the - parameter. As long as the - client can keep up with write-ahead log received, using this mode - requires no extra write-ahead logs to be saved on the primary. + Stream write-ahead log data while the backup is being taken. + This method will open a second connection to the server and + start streaming the write-ahead log in parallel while running + the backup. Therefore, it will require two replication + connections not just one. As long as the client can keep up + with the write-ahead log data, using this method requires no + extra write-ahead logs to be saved on the source server. - When tar format mode is used, the write-ahead log files will be + When tar format is used, the write-ahead log files will be written to a separate file named pg_wal.tar (if the server is a version earlier than 10, the file will be named pg_xlog.tar). @@ -375,7 +368,7 @@ PostgreSQL documentation The following command-line options control the generation of the - backup and the running of the program. + backup and the running of the program: @@ -383,7 +376,8 @@ PostgreSQL documentation - Sets checkpoint mode to fast (immediate) or spread (default) (see ). + Sets checkpoint mode to fast (immediate) or spread (the default) + (see ). @@ -393,9 +387,9 @@ PostgreSQL documentation - This option causes creation of a replication slot named by the - --slot option before starting the backup. - An error is raised if the slot already exists. + Specifies that the replication slot named by the + --slot option should be created before starting + the backup. An error is raised if the slot already exists. @@ -418,9 +412,9 @@ PostgreSQL documentation By default, when pg_basebackup aborts with an error, it removes any directories it might have created before - discovering that it cannot finish the job (for example, data directory - and write-ahead log directory). This option inhibits tidying-up and is - thus useful for debugging. 
+ discovering that it cannot finish the job (for example, the target + directory and write-ahead log directory). This option inhibits + tidying-up and is thus useful for debugging. @@ -460,19 +454,41 @@ PostgreSQL documentation + + + + + + Sets the maximum transfer rate at which data is collected from the + source server. This can be useful to limit the impact + of pg_basebackup on the server. Values + are in kilobytes per second. Use a suffix of M + to indicate megabytes per second. A suffix of k + is also accepted, and has no effect. Valid values are between 32 + kilobytes per second and 1024 megabytes per second. + + + This option always affects transfer of the data directory. Transfer of + WAL files is only affected if the collection method + is fetch. + + + + This option can only be used together with -X - stream. It causes the WAL streaming to use the specified + stream. It causes WAL streaming to use the specified replication slot. If the base backup is intended to be used as a - streaming replication standby using replication slots, it should then - use the same replication slot name - in . That way, it is ensured that - the server does not remove any necessary WAL data in the time between - the end of the base backup and the start of streaming replication. + streaming-replication standby using a replication slot, the standby + should then use the same replication slot name as + . This ensures that the + primary server does not remove any necessary WAL data in the time + between the end of the base backup and the start of streaming + replication on the new standby. 
The specified replication slot has to exist unless the @@ -522,15 +538,15 @@ PostgreSQL documentation Using a SHA hash function provides a cryptographically secure digest of each file for users who wish to verify that the backup has not been - tampered with, while the CRC32C algorithm provides a checksum which is - much faster to calculate and good at catching errors due to accidental + tampered with, while the CRC32C algorithm provides a checksum that is + much faster to calculate; it is good at catching errors due to accidental changes but is not resistant to targeted modifications. Note that, to be useful against an adversary who has access to the backup, the backup manifest would need to be stored securely elsewhere or otherwise verified not to have been modified since the backup was taken. - can be used to check the + can be used to check the integrity of a backup against the backup manifest. @@ -552,11 +568,11 @@ PostgreSQL documentation - This option prevents the server from estimating the total + Prevents the server from estimating the total amount of backup data that will be streamed, resulting in the - backup_total column in the - pg_stat_progress_basebackup - to be NULL. + backup_total column in the + pg_stat_progress_basebackup view + always being NULL. Without this option, the backup will start by enumerating @@ -578,7 +594,7 @@ PostgreSQL documentation Disables generation of a backup manifest. If this option is not specified, the server will generate and send a backup manifest - which can be verified using . + which can be verified using . The manifest is a list of every file present in the backup with the exception of any WAL files that may be included. It also stores the size, last modification time, and an optional checksum for each file. @@ -590,16 +606,17 @@ PostgreSQL documentation - This option prevents the creation of a temporary replication slot - during the backup even if it's supported by the server. 
+ Prevents the creation of a temporary replication slot + for the backup. - Temporary replication slots are created by default if no slot name - is given with the option when using log streaming. + By default, if log streaming is selected but no slot name is given + with the option, then a temporary replication + slot is created (if supported by the source server). The main purpose of this option is to allow taking a base backup when - the server is out of free replication slots. Using replication slots + the server has no free replication slots. Using a replication slot is almost always preferred, because it prevents needed WAL from being removed by the server during the backup. @@ -617,7 +634,7 @@ PostgreSQL documentation By default, checksums are verified and checksum failures will result in a non-zero exit status. However, the base backup will not be removed in such a case, as if the option - had been used. Checksum verifications failures will also be reported + had been used. Checksum verification failures will also be reported in the pg_stat_database view. @@ -627,7 +644,8 @@ PostgreSQL documentation - The following command-line options control the database connection parameters. + The following command-line options control the connection to the source + server: @@ -641,7 +659,7 @@ PostgreSQL documentation The option is called --dbname for consistency with other client applications, but because pg_basebackup - doesn't connect to any particular database in the cluster, database + doesn't connect to any particular database in the cluster, any database name in the connection string will be ignored. @@ -654,7 +672,7 @@ PostgreSQL documentation Specifies the host name of the machine on which the server is running. If the value begins with a slash, it is used as the - directory for the Unix domain socket. The default is taken + directory for a Unix domain socket. 
The default is taken from the PGHOST environment variable, if set, else a Unix domain socket connection is attempted. @@ -679,11 +697,12 @@ PostgreSQL documentation - Specifies the number of seconds between status packets sent back to the - server. This allows for easier monitoring of the progress from server. - A value of zero disables the periodic status updates completely, + Specifies the number of seconds between status packets sent back to + the source server. Smaller values allow more accurate monitoring of + backup progress from the server. + A value of zero disables periodic status updates completely, although an update will still be sent when requested by the server, to - avoid timeout disconnect. The default value is 10 seconds. + avoid timeout-based disconnects. The default value is 10 seconds. @@ -693,7 +712,7 @@ PostgreSQL documentation - User name to connect as. + Specifies the user name to connect as. @@ -703,7 +722,7 @@ PostgreSQL documentation - Never issue a password prompt. If the server requires + Prevents issuing a password prompt. If the server requires password authentication and a password is not available by other means such as a .pgpass file, the connection attempt will fail. This option can be useful in @@ -718,8 +737,8 @@ PostgreSQL documentation - Force pg_basebackup to prompt for a - password before connecting to a database. + Forces pg_basebackup to prompt for a + password before connecting to the source server. @@ -745,7 +764,7 @@ PostgreSQL documentation - Print the pg_basebackup version and exit. + Prints the pg_basebackup version and exits. @@ -755,8 +774,8 @@ PostgreSQL documentation - Show help about pg_basebackup command line - arguments, and exit. + Shows help about pg_basebackup command line + arguments, and exits. @@ -787,11 +806,10 @@ PostgreSQL documentation Notes - At the beginning of the backup, a checkpoint needs to be written on the - server the backup is taken from. 
Especially if the option - --checkpoint=fast is not used, this can take some time - during which pg_basebackup will be appear - to be idle. + At the beginning of the backup, a checkpoint needs to be performed on the + source server. This can take some time (especially if the option + --checkpoint=fast is not used), during + which pg_basebackup will appear to be idle. @@ -806,8 +824,8 @@ PostgreSQL documentation - Tablespaces will in plain format by default be backed up to the same path - they have on the server, unless the + In plain format, tablespaces will be backed up to the same path + they have on the source server, unless the option --tablespace-mapping is used. Without this option, running a plain format base backup on the same host as the server will not work if tablespaces are in use, because the backup would @@ -816,8 +834,9 @@ PostgreSQL documentation - When tar format mode is used, it is the user's responsibility to unpack each - tar file before starting the PostgreSQL server. If there are additional tablespaces, the + When tar format is used, it is the user's responsibility to unpack each + tar file before starting a PostgreSQL server that uses the data. If there + are additional tablespaces, the tar files for them need to be unpacked in the correct locations. In this case the symbolic links for those tablespaces will be created by the server according to the contents of the tablespace_map file that is @@ -827,15 +846,14 @@ PostgreSQL documentation pg_basebackup works with servers of the same or an older major version, down to 9.1. However, WAL streaming mode (-X - stream) only works with server version 9.3 and later, and tar format mode - (--format=tar) of the current version only works with server version 9.5 - or later. + stream) only works with server version 9.3 and later, and tar format + (--format=tar) only works with server version 9.5 + and later. 
- pg_basebackup will preserve group permissions in - both the plain and tar formats if group - permissions are enabled on the source cluster. + pg_basebackup will preserve group permissions + for data files if group permissions are enabled on the source cluster. From 676a9c3cc4b5f1d262c29de318868948513f0fa0 Mon Sep 17 00:00:00 2001 From: Noah Misch Date: Sat, 15 Aug 2020 20:21:52 -0700 Subject: [PATCH 307/334] Correct several behavior descriptions in comments. Reuse cautionary language from src/test/ssl/README in src/test/kerberos/README. SLRUs have had access to six-character segments names since commit 73c986adde5d73a5e2555da9b5c8facedb146dcd, and recovery stopped calling HeapTupleHeaderAdvanceLatestRemovedXid() in commit 558a9165e081d1936573e5a7d576f5febd7fb55a. The other corrections are more self-evident. --- src/backend/access/heap/heapam.c | 2 -- src/backend/access/transam/README | 11 +++++------ src/backend/access/transam/varsup.c | 13 +++++++------ src/backend/commands/async.c | 11 ++++------- src/backend/commands/vacuum.c | 10 +++++----- src/backend/storage/buffer/bufmgr.c | 2 +- src/bin/pg_waldump/pg_waldump.c | 11 +++-------- src/include/access/xlog_internal.h | 7 ++----- src/test/kerberos/README | 10 ++++++---- src/test/perl/PostgresNode.pm | 6 ++---- .../recovery/t/010_logical_decoding_timelines.pl | 2 +- src/test/ssl/t/SSLServer.pm | 1 - 12 files changed, 36 insertions(+), 50 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index f75e1cf0e7b0..9b5f417eac44 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -6920,8 +6920,6 @@ HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple, * updated/deleted by the inserting transaction. * * Look for a committed hint bit, or if no xmin bit is set, check clog. - * This needs to work on both primary and standby, where it is used to - * assess btree delete records. 
*/ if (HeapTupleHeaderXminCommitted(tuple) || (!HeapTupleHeaderXminInvalid(tuple) && TransactionIdDidCommit(xmin))) diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README index c5f09667ba15..1edc8180c128 100644 --- a/src/backend/access/transam/README +++ b/src/backend/access/transam/README @@ -635,12 +635,11 @@ be reconstructed later following a crash and the action is simply a way of optimising for performance. When a hint is written we use MarkBufferDirtyHint() to mark the block dirty. -If the buffer is clean and checksums are in use then -MarkBufferDirtyHint() inserts an XLOG_FPI record to ensure that we -take a full page image that includes the hint. We do this to avoid -a partial page write, when we write the dirtied page. WAL is not -written during recovery, so we simply skip dirtying blocks because -of hints when in recovery. +If the buffer is clean and checksums are in use then MarkBufferDirtyHint() +inserts an XLOG_FPI_FOR_HINT record to ensure that we take a full page image +that includes the hint. We do this to avoid a partial page write, when we +write the dirtied page. WAL is not written during recovery, so we simply skip +dirtying blocks because of hints when in recovery. If you do decide to optimise away a WAL record, then any calls to MarkBufferDirty() must be replaced by MarkBufferDirtyHint(), diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 2d2b05be36c4..a4944faa32e3 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -367,12 +367,13 @@ SetTransactionIdLimit(TransactionId oldest_datfrozenxid, Oid oldest_datoid) * We'll refuse to continue assigning XIDs in interactive mode once we get * within 3M transactions of data loss. This leaves lots of room for the * DBA to fool around fixing things in a standalone backend, while not - * being significant compared to total XID space. 
(Note that since - * vacuuming requires one transaction per table cleaned, we had better be - * sure there's lots of XIDs left...) Also, at default BLCKSZ, this - * leaves two completely-idle segments. In the event of edge-case bugs - * involving page or segment arithmetic, idle segments render the bugs - * unreachable outside of single-user mode. + * being significant compared to total XID space. (VACUUM requires an XID + * if it truncates at wal_level!=minimal. "VACUUM (ANALYZE)", which a DBA + * might do by reflex, assigns an XID. Hence, we had better be sure + * there's lots of XIDs left...) Also, at default BLCKSZ, this leaves two + * completely-idle segments. In the event of edge-case bugs involving + * page or segment arithmetic, idle segments render the bugs unreachable + * outside of single-user mode. */ xidStopLimit = xidWrapLimit - 3000000; if (xidStopLimit < FirstNormalTransactionId) diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c index 4c1286eb988e..774b26fd2c4d 100644 --- a/src/backend/commands/async.c +++ b/src/backend/commands/async.c @@ -302,13 +302,10 @@ static SlruCtlData NotifyCtlData; #define QUEUE_FULL_WARN_INTERVAL 5000 /* warn at most once every 5s */ /* - * slru.c currently assumes that all filenames are four characters of hex - * digits. That means that we can use segments 0000 through FFFF. - * Each segment contains SLRU_PAGES_PER_SEGMENT pages which gives us - * the pages from 0 to SLRU_PAGES_PER_SEGMENT * 0x10000 - 1. - * - * It's of course possible to enhance slru.c, but this gives us so much - * space already that it doesn't seem worth the trouble. + * Use segments 0000 through FFFF. Each contains SLRU_PAGES_PER_SEGMENT pages + * which gives us the pages from 0 to SLRU_PAGES_PER_SEGMENT * 0x10000 - 1. + * We could use as many segments as SlruScanDirectory() allows, but this gives + * us so much space already that it doesn't seem worth the trouble. 
* * The most data we can have in the queue at a time is QUEUE_MAX_PAGE/2 * pages, because more than that would confuse slru.c into thinking there diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 5189a5ad5e37..23eb605d4cb2 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -949,11 +949,11 @@ vacuum_set_xid_limits(Relation rel, /* * We can always ignore processes running lazy vacuum. This is because we * use these values only for deciding which tuples we must keep in the - * tables. Since lazy vacuum doesn't write its XID anywhere, it's safe to - * ignore it. In theory it could be problematic to ignore lazy vacuums in - * a full vacuum, but keep in mind that only one vacuum process can be - * working on a particular table at any time, and that each vacuum is - * always an independent transaction. + * tables. Since lazy vacuum doesn't write its XID anywhere (usually no + * XID assigned), it's safe to ignore it. In theory it could be + * problematic to ignore lazy vacuums in a full vacuum, but keep in mind + * that only one vacuum process can be working on a particular table at + * any time, and that each vacuum is always an independent transaction. */ *oldestXmin = GetOldestNonRemovableTransactionId(rel); diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index f1ae6f9f8443..a2a963bd5b41 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -3578,7 +3578,7 @@ IncrBufferRefCount(Buffer buffer) * This is essentially the same as MarkBufferDirty, except: * * 1. The caller does not write WAL; so if checksums are enabled, we may need - * to write an XLOG_FPI WAL record to protect against torn pages. + * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages. * 2. The caller might have only share-lock instead of exclusive-lock on the * buffer's content lock. * 3. 
This function does not guarantee that the buffer is always marked dirty diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c index d1a067893539..31e99c2a6da5 100644 --- a/src/bin/pg_waldump/pg_waldump.c +++ b/src/bin/pg_waldump/pg_waldump.c @@ -611,14 +611,9 @@ XLogDumpDisplayStats(XLogDumpConfig *config, XLogDumpStats *stats) double rec_len_pct, fpi_len_pct; - /* --- - * Make a first pass to calculate column totals: - * count(*), - * sum(xl_len+SizeOfXLogRecord), - * sum(xl_tot_len-xl_len-SizeOfXLogRecord), and - * sum(xl_tot_len). - * These are used to calculate percentages for each record type. - * --- + /* + * Each row shows its percentages of the total, so make a first pass to + * calculate column totals. */ for (ri = 0; ri < RM_NEXT_ID; ri++) diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 9b2da56379e1..4146753d4765 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -43,11 +43,8 @@ typedef struct XLogPageHeaderData /* * When there is not enough space on current page for whole record, we * continue on the next page. xlp_rem_len is the number of bytes - * remaining from a previous page. - * - * Note that xlp_rem_len includes backup-block data; that is, it tracks - * xl_tot_len not xl_len in the initial header. Also note that the - * continuation data isn't necessarily aligned. + * remaining from a previous page; it tracks xl_tot_len in the initial + * header. Note that the continuation data isn't necessarily aligned. */ uint32 xlp_rem_len; /* total len of remaining data for record */ } XLogPageHeaderData; diff --git a/src/test/kerberos/README b/src/test/kerberos/README index 93af72e16367..fa9c03e78291 100644 --- a/src/test/kerberos/README +++ b/src/test/kerberos/README @@ -8,10 +8,12 @@ functionality. This requires a full MIT Kerberos installation, including server and client tools, and is therefore kept separate and not run by default. 
-Also, this test suite creates a KDC server that listens for TCP/IP -connections on localhost without any real access control, so it is not -safe to run this on a system where there might be untrusted local -users. +CAUTION: The test server run by this test is configured to listen for TCP +connections on localhost. Any user on the same host is able to log in to the +test server while the tests are running. Do not run this suite on a multi-user +system where you don't trust all local users! Also, this test suite creates a +KDC server that listens for TCP/IP connections on localhost without any real +access control. Running the tests ================= diff --git a/src/test/perl/PostgresNode.pm b/src/test/perl/PostgresNode.pm index 8c1b77376fb0..1488bffa2ba3 100644 --- a/src/test/perl/PostgresNode.pm +++ b/src/test/perl/PostgresNode.pm @@ -1234,10 +1234,8 @@ sub can_bind return $ret; } -# Automatically shut down any still-running nodes when the test script exits. -# Note that this just stops the postmasters (in the same order the nodes were -# created in). Any temporary directories are deleted, in an unspecified -# order, later when the File::Temp objects are destroyed. +# Automatically shut down any still-running nodes (in the same order the nodes +# were created in) when the test script exits. 
END { diff --git a/src/test/recovery/t/010_logical_decoding_timelines.pl b/src/test/recovery/t/010_logical_decoding_timelines.pl index 09aaefa9f032..329500f0ae5b 100644 --- a/src/test/recovery/t/010_logical_decoding_timelines.pl +++ b/src/test/recovery/t/010_logical_decoding_timelines.pl @@ -111,7 +111,7 @@ # Examine the physical slot the replica uses to stream changes # from the primary to make sure its hot_standby_feedback # has locked in a catalog_xmin on the physical slot, and that -# any xmin is < the catalog_xmin +# any xmin is >= the catalog_xmin $node_primary->poll_query_until( 'postgres', q[ SELECT catalog_xmin IS NOT NULL diff --git a/src/test/ssl/t/SSLServer.pm b/src/test/ssl/t/SSLServer.pm index 1e392b8fbf61..f5987a003efd 100644 --- a/src/test/ssl/t/SSLServer.pm +++ b/src/test/ssl/t/SSLServer.pm @@ -9,7 +9,6 @@ # - a database called trustdb that lets anyone in # - another database called certdb that uses certificate authentication, ie. # the client must present a valid certificate signed by the client CA -# - two users, called ssltestuser and anotheruser. # # The server is configured to only accept connections from localhost. If you # want to run the client from another host, you'll have to configure that From 49967da65aec970fcda123acc681f1df5d70bfc6 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Sun, 16 Aug 2020 12:57:01 -0700 Subject: [PATCH 308/334] Make vacuum a bit more verbose to debug BF failure. This is temporary. While possibly some more error checking / debugging in this path would be a good thing, it'll not look exactly like this. 
Discussion: https://postgr.es/m/20200816181604.l54m6kss5ntd6xow@alap3.anarazel.de --- src/backend/access/heap/heapam.c | 11 ++++++++++- src/backend/access/heap/vacuumlazy.c | 7 +++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 9b5f417eac44..8eb276e46449 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -6048,7 +6048,16 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask, TransactionIdIsInProgress(members[i].xid)) { /* running locker cannot possibly be older than the cutoff */ - Assert(!TransactionIdPrecedes(members[i].xid, cutoff_xid)); + if (TransactionIdPrecedes(members[i].xid, cutoff_xid)) + { + /* temporary on-bf debugging */ + elog(PANIC, "too old alive locker: multi: %u, member xid: %u, memb-current: %d, memb-progress: %d, cutoff: %u, cutoff-multi: %u, relfrozenxid: %u, relminmxid: %u", + multi, members[i].xid, + TransactionIdIsCurrentTransactionId(members[i].xid), + TransactionIdIsInProgress(members[i].xid), + cutoff_xid, cutoff_multi, + relfrozenxid, relminmxid); + } newmembers[nnewmembers++] = members[i]; has_lockers = true; } diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 44e2224dd557..03c8e1ff7ea9 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -1350,7 +1350,14 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats, if (HeapTupleIsHotUpdated(&tuple) || HeapTupleIsHeapOnly(&tuple) || params->index_cleanup == VACOPT_TERNARY_DISABLED) + { + /* temporary on-bf debugging */ + elog(LOG, "treating dead HOT tuple (updated %d, heap only: %d, index cleanup: %d) as alive", + HeapTupleIsHotUpdated(&tuple), HeapTupleIsHeapOnly(&tuple), + params->index_cleanup == VACOPT_TERNARY_DISABLED); + nkeep += 1; + } else tupgone = true; /* we can delete the tuple */ all_visible = false; From 
f6661d3df228dbbf50efb04f2b760774a6f2bfff Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Sun, 16 Aug 2020 14:21:37 -0700 Subject: [PATCH 309/334] Fix use of wrong index in ComputeXidHorizons(). This bug, recently introduced in 941697c3c1a, at least lead to vacuum failing because it found tuples inserted by a running transaction, but below the freeze limit. The freeze limit in turn is directly affected by the aforementioned bug. Thanks to Tom Lane figuring how to make the bug reproducible. We should add a few more assertions to make sure this type of bug isn't as hard to notice, but it's not yet clear how to best do so. Co-Diagnosed-By: Tom Lane Author: Andres Freund Discussion: https://postgr.es/m/1013484.1597609043@sss.pgh.pa.us --- src/backend/storage/ipc/procarray.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 8262abd42e6b..96e4a8785760 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -1663,7 +1663,7 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h) TransactionId xmin; /* Fetch xid just once - see GetNewTransactionId */ - xid = UINT32_ACCESS_ONCE(other_xids[pgprocno]); + xid = UINT32_ACCESS_ONCE(other_xids[index]); xmin = UINT32_ACCESS_ONCE(proc->xmin); /* From b4f16397af460d9d6ead31b86cb3e7f562806866 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Mon, 17 Aug 2020 10:23:17 +0900 Subject: [PATCH 310/334] doc: Fix description about bgwriter and checkpoint in HA section Since 806a2ae, the work of the bgwriter is split the checkpointer, but a portion of the documentation did not get the message. 
Author: Masahiko Sawada Discussion: https://postgr.es/m/CA+fd4k6jXxjAtjMVC=wG3=QGpauZBtcgN3Jhw+oV7zXGKVLKzQ@mail.gmail.com Backpatch-through: 9.5 --- doc/src/sgml/high-availability.sgml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/src/sgml/high-availability.sgml b/doc/src/sgml/high-availability.sgml index a824d383f2d8..d6f79fc435ea 100644 --- a/doc/src/sgml/high-availability.sgml +++ b/doc/src/sgml/high-availability.sgml @@ -2380,9 +2380,10 @@ LOG: database system is ready to accept read only connections - The background writer is active during recovery and will perform - restartpoints (similar to checkpoints on the primary) and normal block - cleaning activities. This can include updates of the hint bit + The checkpointer process and the background writer process are active during + recovery. The checkpointer process will perform restartpoints (similar to + checkpoints on the primary) and the background writer process will perform + normal block cleaning activities. This can include updates of the hint bit information stored on the standby server. The CHECKPOINT command is accepted during recovery, though it performs a restartpoint rather than a new checkpoint. From d7ec8337f9093b097f08f94e5ecec36303ad73fd Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 17 Aug 2020 09:27:29 +0300 Subject: [PATCH 311/334] Fix printing last progress report line in client programs. A number of client programs have a "--progress" option that when printing to a TTY, updates the current line by printing a '\r' and overwriting it. After the last line, '\n' needs to be printed to move the cursor to the next line. pg_basebackup and pgbench got this right, but pg_rewind and pg_checksums were slightly wrong. pg_rewind printed the newline to stdout instead of stderr, and pg_checksums printed the newline even when not printing to a TTY. 
Fix them, and also add a 'finished' argument to pg_basebackup's progress_report() function, to keep it consistent with the other programs. Backpatch to v12. pg_rewind's newline was broken with the logging changes in commit cc8d415117 in v12, and pg_checksums was introduced in v12. Discussion: https://www.postgresql.org/message-id/82b539e5-ae33-34b0-1aee-22b3379fd3eb@iki.fi --- src/bin/pg_basebackup/pg_basebackup.c | 38 ++++++++++++++------------- src/bin/pg_checksums/pg_checksums.c | 14 +++++----- src/bin/pg_rewind/pg_rewind.c | 22 +++++++++------- src/bin/pg_rewind/pg_rewind.h | 2 +- 4 files changed, 41 insertions(+), 35 deletions(-) diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index 4f29671d0cdc..8158c8e41957 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -188,7 +188,8 @@ static PQExpBuffer recoveryconfcontents = NULL; /* Function headers */ static void usage(void); static void verify_dir_is_empty_or_create(char *dirname, bool *created, bool *found); -static void progress_report(int tablespacenum, const char *filename, bool force); +static void progress_report(int tablespacenum, const char *filename, bool force, + bool finished); static void ReceiveTarFile(PGconn *conn, PGresult *res, int rownum); static void ReceiveTarCopyChunk(size_t r, char *copybuf, void *callback_data); @@ -765,11 +766,15 @@ verify_dir_is_empty_or_create(char *dirname, bool *created, bool *found) * Print a progress report based on the global variables. If verbose output * is enabled, also print the current file name. * - * Progress report is written at maximum once per second, unless the - * force parameter is set to true. + * Progress report is written at maximum once per second, unless the force + * parameter is set to true. + * + * If finished is set to true, this is the last progress report. The cursor + * is moved to the next line. 
*/ static void -progress_report(int tablespacenum, const char *filename, bool force) +progress_report(int tablespacenum, const char *filename, + bool force, bool finished) { int percent; char totaldone_str[32]; @@ -780,7 +785,7 @@ progress_report(int tablespacenum, const char *filename, bool force) return; now = time(NULL); - if (now == last_progress_report && !force) + if (now == last_progress_report && !force && !finished) return; /* Max once per second */ last_progress_report = now; @@ -851,10 +856,11 @@ progress_report(int tablespacenum, const char *filename, bool force) totaldone_str, totalsize_str, percent, tablespacenum, tablespacecount); - if (isatty(fileno(stderr))) - fprintf(stderr, "\r"); - else - fprintf(stderr, "\n"); + /* + * Stay on the same line if reporting to a terminal and we're not done + * yet. + */ + fprintf(stderr, (!finished && isatty(fileno(stderr))) ? "\r" : "\n"); } static int32 @@ -1277,7 +1283,7 @@ ReceiveTarFile(PGconn *conn, PGresult *res, int rownum) } } - progress_report(rownum, state.filename, true); + progress_report(rownum, state.filename, true, false); /* * Do not sync the resulting tar file yet, all files are synced once at @@ -1470,7 +1476,7 @@ ReceiveTarCopyChunk(size_t r, char *copybuf, void *callback_data) } } totaldone += r; - progress_report(state->tablespacenum, state->filename, false); + progress_report(state->tablespacenum, state->filename, false, false); } @@ -1528,7 +1534,7 @@ ReceiveAndUnpackTarFile(PGconn *conn, PGresult *res, int rownum) if (state.file) fclose(state.file); - progress_report(rownum, state.filename, true); + progress_report(rownum, state.filename, true, false); if (state.file != NULL) { @@ -1709,7 +1715,7 @@ ReceiveTarAndUnpackCopyChunk(size_t r, char *copybuf, void *callback_data) exit(1); } totaldone += r; - progress_report(state->tablespacenum, state->filename, false); + progress_report(state->tablespacenum, state->filename, false, false); state->current_len_left -= r; if (state->current_len_left 
== 0 && state->current_padding == 0) @@ -2027,11 +2033,7 @@ BaseBackup(void) ReceiveBackupManifest(conn); if (showprogress) - { - progress_report(PQntuples(res), NULL, true); - if (isatty(fileno(stderr))) - fprintf(stderr, "\n"); /* Need to move to next line */ - } + progress_report(PQntuples(res), NULL, true, true); PQclear(res); diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c index 1daa5aed0e0f..0696db69bbd5 100644 --- a/src/bin/pg_checksums/pg_checksums.c +++ b/src/bin/pg_checksums/pg_checksums.c @@ -125,7 +125,7 @@ static const struct exclude_list_item skip[] = { * src/bin/pg_basebackup/pg_basebackup.c. */ static void -progress_report(bool force) +progress_report(bool finished) { int percent; char total_size_str[32]; @@ -135,7 +135,7 @@ progress_report(bool force) Assert(showprogress); now = time(NULL); - if (now == last_progress_report && !force) + if (now == last_progress_report && !finished) return; /* Max once per second */ /* Save current time */ @@ -162,8 +162,11 @@ progress_report(bool force) (int) strlen(current_size_str), current_size_str, total_size_str, percent); - /* Stay on the same line if reporting to a terminal */ - fprintf(stderr, isatty(fileno(stderr)) ? "\r" : "\n"); + /* + * Stay on the same line if reporting to a terminal and we're not done + * yet. + */ + fprintf(stderr, (!finished && isatty(fileno(stderr))) ? 
"\r" : "\n"); } static bool @@ -624,10 +627,7 @@ main(int argc, char *argv[]) (void) scan_directory(DataDir, "pg_tblspc", false); if (showprogress) - { progress_report(true); - fprintf(stderr, "\n"); /* Need to move to next line */ - } printf(_("Checksum operation completed\n")); printf(_("Files scanned: %s\n"), psprintf(INT64_FORMAT, files)); diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index 0015d3b461a7..a9aecc790528 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -422,7 +422,6 @@ main(int argc, char **argv) executeFileMap(); progress_report(true); - printf("\n"); if (showprogress) pg_log_info("creating backup label and updating control file"); @@ -519,11 +518,14 @@ sanityChecks(void) /* * Print a progress report based on the fetch_size and fetch_done variables. * - * Progress report is written at maximum once per second, unless the - * force parameter is set to true. + * Progress report is written at maximum once per second, except that the + * last progress report is always printed. + * + * If finished is set to true, this is the last progress report. The cursor + * is moved to the next line. */ void -progress_report(bool force) +progress_report(bool finished) { static pg_time_t last_progress_report = 0; int percent; @@ -535,7 +537,7 @@ progress_report(bool force) return; now = time(NULL); - if (now == last_progress_report && !force) + if (now == last_progress_report && !finished) return; /* Max once per second */ last_progress_report = now; @@ -565,10 +567,12 @@ progress_report(bool force) fprintf(stderr, _("%*s/%s kB (%d%%) copied"), (int) strlen(fetch_size_str), fetch_done_str, fetch_size_str, percent); - if (isatty(fileno(stderr))) - fprintf(stderr, "\r"); - else - fprintf(stderr, "\n"); + + /* + * Stay on the same line if reporting to a terminal and we're not done + * yet. + */ + fprintf(stderr, (!finished && isatty(fileno(stderr))) ? 
"\r" : "\n"); } /* diff --git a/src/bin/pg_rewind/pg_rewind.h b/src/bin/pg_rewind/pg_rewind.h index 5cf5f17bb5f1..8a9319ed6759 100644 --- a/src/bin/pg_rewind/pg_rewind.h +++ b/src/bin/pg_rewind/pg_rewind.h @@ -53,7 +53,7 @@ extern XLogRecPtr readOneRecord(const char *datadir, XLogRecPtr ptr, int tliIndex, const char *restoreCommand); /* in pg_rewind.c */ -extern void progress_report(bool force); +extern void progress_report(bool finished); /* in timeline.c */ extern TimeLineHistoryEntry *rewind_parseTimeLineHistory(char *buffer, From 3941eb6341d8274dd63a26972042da6632533f2b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 17 Aug 2020 10:50:13 +0300 Subject: [PATCH 312/334] Make xact.h usable in frontend. xact.h included utils/datetime.h, which cannot be used in the frontend (it includes fmgr.h, which needs Datum). But xact.h only needs the definition of TimestampTz from it, which is available directly in datatypes/timestamp.h. Change xact.h to include that instead of utils/datetime.h, so that it can be used in client programs. 
--- contrib/pg_prewarm/autoprewarm.c | 1 + contrib/postgres_fdw/connection.c | 1 + src/backend/nodes/params.c | 1 + src/backend/utils/time/snapmgr.c | 2 ++ src/include/access/xact.h | 2 +- 5 files changed, 6 insertions(+), 1 deletion(-) diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c index d797095458a4..c32ddc56fdbc 100644 --- a/contrib/pg_prewarm/autoprewarm.c +++ b/contrib/pg_prewarm/autoprewarm.c @@ -46,6 +46,7 @@ #include "storage/smgr.h" #include "tcop/tcopprot.h" #include "utils/acl.h" +#include "utils/datetime.h" #include "utils/guc.h" #include "utils/memutils.h" #include "utils/rel.h" diff --git a/contrib/postgres_fdw/connection.c b/contrib/postgres_fdw/connection.c index 52d1fe356315..08daf26fdf08 100644 --- a/contrib/postgres_fdw/connection.c +++ b/contrib/postgres_fdw/connection.c @@ -22,6 +22,7 @@ #include "postgres_fdw.h" #include "storage/fd.h" #include "storage/latch.h" +#include "utils/datetime.h" #include "utils/hsearch.h" #include "utils/inval.h" #include "utils/memutils.h" diff --git a/src/backend/nodes/params.c b/src/backend/nodes/params.c index 1719119fc28f..bce0c7e72b2c 100644 --- a/src/backend/nodes/params.c +++ b/src/backend/nodes/params.c @@ -16,6 +16,7 @@ #include "postgres.h" #include "access/xact.h" +#include "fmgr.h" #include "mb/stringinfo_mb.h" #include "nodes/params.h" #include "parser/parse_node.h" diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 752af0c10dfc..c208538e2e5c 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -53,6 +53,7 @@ #include "access/xact.h" #include "access/xlog.h" #include "catalog/catalog.h" +#include "datatype/timestamp.h" #include "lib/pairingheap.h" #include "miscadmin.h" #include "storage/predicate.h" @@ -67,6 +68,7 @@ #include "utils/resowner_private.h" #include "utils/snapmgr.h" #include "utils/syscache.h" +#include "utils/timestamp.h" /* diff --git a/src/include/access/xact.h 
b/src/include/access/xact.h index c59de9bebaf8..df1b43a932e3 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -16,11 +16,11 @@ #include "access/transam.h" #include "access/xlogreader.h" +#include "datatype/timestamp.h" #include "lib/stringinfo.h" #include "nodes/pg_list.h" #include "storage/relfilenode.h" #include "storage/sinval.h" -#include "utils/datetime.h" /* * Maximum size of Global Transaction ID (including '\0'). From a28d731a1187e8d9d8c2b6319375fcbf0a8debd5 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Mon, 17 Aug 2020 10:52:58 +0300 Subject: [PATCH 313/334] Mark commit and abort WAL records with XLR_SPECIAL_REL_UPDATE. If a commit or abort record includes "dropped relfilenodes", then replaying the record will remove data files. That is surely a "special rel update", but the records were not marked as such. Fix that, teach pg_rewind to expect and ignore them, and add a test case to cover it. It's always been like this, but no backporting for fear of breaking existing applications. If an application parsed the WAL but was not handling commit/abort records, it would stop working. That might be a good thing if it really needed to handle the dropped rels, but it will be caught when the application is updated to work with PostgreSQL v14 anyway. 
Discussion: https://www.postgresql.org/message-id/07b33e2c-46a6-86a1-5f9e-a7da73fddb95%40iki.fi Reviewed-by: Amit Kapila, Michael Paquier --- src/backend/access/transam/xact.c | 2 ++ src/bin/pg_rewind/parsexlog.c | 13 +++++++++++++ src/bin/pg_rewind/t/001_basic.pl | 15 ++++++++++++++- 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 7ccb7d68ed9a..af6afcebb133 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -5565,6 +5565,7 @@ XactLogCommitRecord(TimestampTz commit_time, { xl_xinfo.xinfo |= XACT_XINFO_HAS_RELFILENODES; xl_relfilenodes.nrels = nrels; + info |= XLR_SPECIAL_REL_UPDATE; } if (nmsgs > 0) @@ -5697,6 +5698,7 @@ XactLogAbortRecord(TimestampTz abort_time, { xl_xinfo.xinfo |= XACT_XINFO_HAS_RELFILENODES; xl_relfilenodes.nrels = nrels; + info |= XLR_SPECIAL_REL_UPDATE; } if (TransactionIdIsValid(twophase_xid)) diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c index 2325fb5d3021..2229c86f9afb 100644 --- a/src/bin/pg_rewind/parsexlog.c +++ b/src/bin/pg_rewind/parsexlog.c @@ -14,6 +14,7 @@ #include #include "access/rmgr.h" +#include "access/xact.h" #include "access/xlog_internal.h" #include "access/xlogreader.h" #include "catalog/pg_control.h" @@ -397,6 +398,18 @@ extractPageInfo(XLogReaderState *record) * source system. */ } + else if (rmid == RM_XACT_ID && + ((rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT || + (rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_COMMIT_PREPARED || + (rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_ABORT || + (rminfo & XLOG_XACT_OPMASK) == XLOG_XACT_ABORT_PREPARED)) + { + /* + * These records can include "dropped rels". We can safely ignore + * them, we will see that they are missing and copy them from the + * source. 
+ */ + } else if (info & XLR_SPECIAL_REL_UPDATE) { /* diff --git a/src/bin/pg_rewind/t/001_basic.pl b/src/bin/pg_rewind/t/001_basic.pl index fb4a0acd965a..ba528e262f32 100644 --- a/src/bin/pg_rewind/t/001_basic.pl +++ b/src/bin/pg_rewind/t/001_basic.pl @@ -1,7 +1,7 @@ use strict; use warnings; use TestLib; -use Test::More tests => 20; +use Test::More tests => 23; use FindBin; use lib $FindBin::RealBin; @@ -29,6 +29,10 @@ sub run_test primary_psql("CREATE TABLE tail_tbl (id integer, d text)"); primary_psql("INSERT INTO tail_tbl VALUES (0, 'in primary')"); + # This test table is dropped in the old primary after promotion. + primary_psql("CREATE TABLE drop_tbl (d text)"); + primary_psql("INSERT INTO drop_tbl VALUES ('in primary')"); + primary_psql("CHECKPOINT"); RewindTest::create_standby($test_mode); @@ -66,6 +70,9 @@ sub run_test primary_psql("DELETE FROM tail_tbl WHERE id > 10"); primary_psql("VACUUM tail_tbl"); + # Drop drop_tbl. pg_rewind should copy it back. + primary_psql("DROP TABLE drop_tbl"); + # Before running pg_rewind, do a couple of extra tests with several # option combinations. As the code paths taken by those tests # do not change for the "local" and "remote" modes, just run them @@ -154,6 +161,12 @@ sub run_test ), 'tail-copy'); + check_query( + 'SELECT * FROM drop_tbl', + qq(in primary +), + 'drop'); + # Permissions on PGDATA should be default SKIP: { From 22e75a341ecc841bdc1db417d11a643b0a42df4f Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 17 Aug 2020 15:40:07 -0400 Subject: [PATCH 314/334] Doc: fix description of UNION/CASE/etc type unification. The description of what select_common_type() does was not terribly accurate. Improve it. 
David Johnston and Tom Lane Discussion: https://postgr.es/m/1019930.1597613200@sss.pgh.pa.us --- doc/src/sgml/typeconv.sgml | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/doc/src/sgml/typeconv.sgml b/doc/src/sgml/typeconv.sgml index 81dba7dacfed..8900d0eb3832 100644 --- a/doc/src/sgml/typeconv.sgml +++ b/doc/src/sgml/typeconv.sgml @@ -1069,7 +1069,7 @@ domain's base type for all subsequent steps. functions, this behavior allows a domain type to be preserved through a UNION or similar construct, so long as the user is careful to ensure that all inputs are implicitly or explicitly of that - exact type. Otherwise the domain's base type will be preferred. + exact type. Otherwise the domain's base type will be used. @@ -1092,24 +1092,29 @@ If the non-unknown inputs are not all of the same type category, fail. -Choose the first non-unknown input type which is a preferred type in -that category, if there is one. - - - - - -Otherwise, choose the last non-unknown input type that allows all the -preceding non-unknown inputs to be implicitly converted to it. (There -always is such a type, since at least the first type in the list must -satisfy this condition.) +Select the first non-unknown input type as the candidate type, +then consider each other non-unknown input type, left to right. + + + For historical reasons, CASE treats + its ELSE clause (if any) as the first + input, with the THEN clauses(s) considered after + that. In all other cases, left to right means the order + in which the expressions appear in the query text. + + +If the candidate type can be implicitly converted to the other type, +but not vice-versa, select the other type as the new candidate type. +Then continue considering the remaining inputs. If, at any stage of this +process, a preferred type is selected, stop considering additional +inputs. -Convert all inputs to the selected type. 
Fail if there is not a -conversion from a given input to the selected type. +Convert all inputs to the final candidate type. Fail if there is not an +implicit conversion from a given input type to the candidate type. From 6e70443edacfc86674995c0c10ade0aec7a4fddf Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Mon, 17 Aug 2020 16:20:06 -0400 Subject: [PATCH 315/334] Disable autovacuum for BRIN test table This should improve stability in the tests. Per buildfarm member hyrax (CLOBBER_CACHE_ALWAYS) via Tom Lane. Discussion: https://postgr.es/m/871534.1597503261@sss.pgh.pa.us --- src/test/regress/expected/brin.out | 2 +- src/test/regress/sql/brin.sql | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/regress/expected/brin.out b/src/test/regress/expected/brin.out index 0b14c73fc645..18403498dfab 100644 --- a/src/test/regress/expected/brin.out +++ b/src/test/regress/expected/brin.out @@ -26,7 +26,7 @@ CREATE TABLE brintest (byteacol bytea, int4rangecol int4range, lsncol pg_lsn, boxcol box -) WITH (fillfactor=10); +) WITH (fillfactor=10, autovacuum_enabled=off); INSERT INTO brintest SELECT repeat(stringu1, 8)::bytea, substr(stringu1, 1, 1)::"char", diff --git a/src/test/regress/sql/brin.sql b/src/test/regress/sql/brin.sql index 1289e76ecb9b..d1a82474f3f1 100644 --- a/src/test/regress/sql/brin.sql +++ b/src/test/regress/sql/brin.sql @@ -26,7 +26,7 @@ CREATE TABLE brintest (byteacol bytea, int4rangecol int4range, lsncol pg_lsn, boxcol box -) WITH (fillfactor=10); +) WITH (fillfactor=10, autovacuum_enabled=off); INSERT INTO brintest SELECT repeat(stringu1, 8)::bytea, From adbe62d04b360bbd408d97e447932d8078485972 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Tue, 18 Aug 2020 11:10:50 +0900 Subject: [PATCH 316/334] Add PL/Sample to src/test/modules/ PL/Sample is an example template of procedural-language handler. This can be used as a base to implement a custom PL, or as a facility to test APIs dedicated to PLs. 
Much more could be done in this module, like adding a simple validator, but this is left as future work. The documentation included originally some C code to understand the basics of PL handler implementation, but it was outdated, and not really helpful either if trying to implement a new procedural language, particularly when it came to the integration of a PL installation with CREATE EXTENSION. Author: Mark Wong Reviewed-by: Tom Lane, Michael Paquier Discussion: https://postgr.es/m/20200612172648.GA3327@2ndQuadrant.com --- doc/src/sgml/plhandler.sgml | 60 +----- src/test/modules/Makefile | 1 + src/test/modules/plsample/.gitignore | 3 + src/test/modules/plsample/Makefile | 20 ++ src/test/modules/plsample/README | 6 + .../modules/plsample/expected/plsample.out | 36 ++++ src/test/modules/plsample/plsample--1.0.sql | 14 ++ src/test/modules/plsample/plsample.c | 183 ++++++++++++++++++ src/test/modules/plsample/plsample.control | 8 + src/test/modules/plsample/sql/plsample.sql | 15 ++ 10 files changed, 290 insertions(+), 56 deletions(-) create mode 100644 src/test/modules/plsample/.gitignore create mode 100644 src/test/modules/plsample/Makefile create mode 100644 src/test/modules/plsample/README create mode 100644 src/test/modules/plsample/expected/plsample.out create mode 100644 src/test/modules/plsample/plsample--1.0.sql create mode 100644 src/test/modules/plsample/plsample.c create mode 100644 src/test/modules/plsample/plsample.control create mode 100644 src/test/modules/plsample/sql/plsample.sql diff --git a/doc/src/sgml/plhandler.sgml b/doc/src/sgml/plhandler.sgml index e1b0af7a60d1..40ee59de9f34 100644 --- a/doc/src/sgml/plhandler.sgml +++ b/doc/src/sgml/plhandler.sgml @@ -96,62 +96,10 @@ - This is a template for a procedural-language handler written in C: - -#include "postgres.h" -#include "executor/spi.h" -#include "commands/trigger.h" -#include "fmgr.h" -#include "access/heapam.h" -#include "utils/syscache.h" -#include "catalog/pg_proc.h" -#include 
"catalog/pg_type.h" - -PG_MODULE_MAGIC; - -PG_FUNCTION_INFO_V1(plsample_call_handler); - -Datum -plsample_call_handler(PG_FUNCTION_ARGS) -{ - Datum retval; - - if (CALLED_AS_TRIGGER(fcinfo)) - { - /* - * Called as a trigger function - */ - TriggerData *trigdata = (TriggerData *) fcinfo->context; - - retval = ... - } - else - { - /* - * Called as a function - */ - - retval = ... - } - - return retval; -} - - Only a few thousand lines of code have to be added instead of the - dots to complete the call handler. - - - - After having compiled the handler function into a loadable module - (see ), the following commands then - register the sample procedural language: - -CREATE FUNCTION plsample_call_handler() RETURNS language_handler - AS 'filename' - LANGUAGE C; -CREATE LANGUAGE plsample - HANDLER plsample_call_handler; - + A template for a procedural-language handler written as a C extension is + provided in src/test/modules/plsample. This is a + working sample demonstrating one way to create a procedural-language + handler, process parameters, and return a value. 
diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile index 1428529b041a..a6d2ffbf9e0e 100644 --- a/src/test/modules/Makefile +++ b/src/test/modules/Makefile @@ -10,6 +10,7 @@ SUBDIRS = \ delay_execution \ dummy_index_am \ dummy_seclabel \ + plsample \ snapshot_too_old \ test_bloomfilter \ test_ddl_deparse \ diff --git a/src/test/modules/plsample/.gitignore b/src/test/modules/plsample/.gitignore new file mode 100644 index 000000000000..44d119cfcc24 --- /dev/null +++ b/src/test/modules/plsample/.gitignore @@ -0,0 +1,3 @@ +# Generated subdirectories +/log/ +/results/ diff --git a/src/test/modules/plsample/Makefile b/src/test/modules/plsample/Makefile new file mode 100644 index 000000000000..f1bc334bfc87 --- /dev/null +++ b/src/test/modules/plsample/Makefile @@ -0,0 +1,20 @@ +# src/test/modules/plsample/Makefile + +MODULES = plsample + +EXTENSION = plsample +DATA = plsample--1.0.sql +PGFILEDESC = "PL/Sample - template for procedural language" + +REGRESS = plsample + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/plsample +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/src/test/modules/plsample/README b/src/test/modules/plsample/README new file mode 100644 index 000000000000..0ed319308d22 --- /dev/null +++ b/src/test/modules/plsample/README @@ -0,0 +1,6 @@ +PL/Sample +========= + +PL/Sample is an example template of procedural-language handler. It is +a simple implementation, yet demonstrates some of the things that can be done +to build a fully functional procedural-language handler. 
diff --git a/src/test/modules/plsample/expected/plsample.out b/src/test/modules/plsample/expected/plsample.out new file mode 100644 index 000000000000..a0c318b6df55 --- /dev/null +++ b/src/test/modules/plsample/expected/plsample.out @@ -0,0 +1,36 @@ +CREATE EXTENSION plsample; +-- Create and test some dummy functions +CREATE FUNCTION plsample_result_text(a1 numeric, a2 text, a3 integer[]) +RETURNS TEXT +AS $$ + Example of source with text result. +$$ LANGUAGE plsample; +SELECT plsample_result_text(1.23, 'abc', '{4, 5, 6}'); +NOTICE: source text of function "plsample_result_text": + Example of source with text result. + +NOTICE: argument: 0; name: a1; value: 1.23 +NOTICE: argument: 1; name: a2; value: abc +NOTICE: argument: 2; name: a3; value: {4,5,6} + plsample_result_text +--------------------------------------- + + + Example of source with text result.+ + +(1 row) + +CREATE FUNCTION plsample_result_void(a1 text[]) +RETURNS VOID +AS $$ + Example of source with void result. +$$ LANGUAGE plsample; +SELECT plsample_result_void('{foo, bar, hoge}'); +NOTICE: source text of function "plsample_result_void": + Example of source with void result. + +NOTICE: argument: 0; name: a1; value: {foo,bar,hoge} + plsample_result_void +---------------------- + +(1 row) + diff --git a/src/test/modules/plsample/plsample--1.0.sql b/src/test/modules/plsample/plsample--1.0.sql new file mode 100644 index 000000000000..fc5b280bd4fa --- /dev/null +++ b/src/test/modules/plsample/plsample--1.0.sql @@ -0,0 +1,14 @@ +/* src/test/modules/plsample/plsample--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION plsample" to load this file. 
\quit + +CREATE FUNCTION plsample_call_handler() RETURNS language_handler + AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE TRUSTED LANGUAGE plsample + HANDLER plsample_call_handler; + +ALTER LANGUAGE plsample OWNER TO @extowner@; + +COMMENT ON LANGUAGE plsample IS 'PL/Sample procedural language'; diff --git a/src/test/modules/plsample/plsample.c b/src/test/modules/plsample/plsample.c new file mode 100644 index 000000000000..408366906697 --- /dev/null +++ b/src/test/modules/plsample/plsample.c @@ -0,0 +1,183 @@ +/*------------------------------------------------------------------------- + * + * plsample.c + * Handler for the PL/Sample procedural language + * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/test/modules/plsample/plsample.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "catalog/pg_proc.h" +#include "catalog/pg_type.h" +#include "commands/event_trigger.h" +#include "commands/trigger.h" +#include "funcapi.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/syscache.h" + +PG_MODULE_MAGIC; + +PG_FUNCTION_INFO_V1(plsample_call_handler); + +static Datum plsample_func_handler(PG_FUNCTION_ARGS); + +/* + * Handle function, procedure, and trigger calls. + */ +Datum +plsample_call_handler(PG_FUNCTION_ARGS) +{ + Datum retval = (Datum) 0; + + PG_TRY(); + { + /* + * Determine if called as function or trigger and call appropriate + * subhandler. + */ + if (CALLED_AS_TRIGGER(fcinfo)) + { + /* + * This function has been called as a trigger function, where + * (TriggerData *) fcinfo->context includes the information of the + * context. 
+ */ + } + else if (CALLED_AS_EVENT_TRIGGER(fcinfo)) + { + /* + * This function is called as an event trigger function, where + * (EventTriggerData *) fcinfo->context includes the information + * of the context. + */ + } + else + { + /* Regular function handler */ + retval = plsample_func_handler(fcinfo); + } + } + PG_FINALLY(); + { + } + PG_END_TRY(); + + return retval; +} + +/* + * plsample_func_handler + * + * Function called by the call handler for function execution. + */ +static Datum +plsample_func_handler(PG_FUNCTION_ARGS) +{ + HeapTuple pl_tuple; + Datum ret; + char *source; + bool isnull; + FmgrInfo *arg_out_func; + Form_pg_type type_struct; + HeapTuple type_tuple; + Form_pg_proc pl_struct; + volatile MemoryContext proc_cxt = NULL; + Oid *argtypes; + char **argnames; + char *argmodes; + char *proname; + Form_pg_type pg_type_entry; + Oid result_typioparam; + FmgrInfo result_in_func; + int numargs; + + /* Fetch the source text of the function. */ + pl_tuple = SearchSysCache(PROCOID, + ObjectIdGetDatum(fcinfo->flinfo->fn_oid), 0, 0, 0); + if (!HeapTupleIsValid(pl_tuple)) + elog(ERROR, "cache lookup failed for function %u", + fcinfo->flinfo->fn_oid); + + /* + * Extract and print the source text of the function. This can be used as + * a base for the function validation and execution. + */ + pl_struct = (Form_pg_proc) GETSTRUCT(pl_tuple); + proname = pstrdup(NameStr(pl_struct->proname)); + ret = SysCacheGetAttr(PROCOID, pl_tuple, Anum_pg_proc_prosrc, &isnull); + if (isnull) + elog(ERROR, "could not find source text of function \"%s\"", + proname); + ReleaseSysCache(pl_tuple); + source = DatumGetCString(DirectFunctionCall1(textout, ret)); + ereport(NOTICE, + (errmsg("source text of function \"%s\": %s", + proname, source))); + + /* + * Allocate a context that will hold all the Postgres data for the + * procedure. 
+ */ + proc_cxt = AllocSetContextCreate(TopMemoryContext, + "PL/Sample function", + ALLOCSET_SMALL_SIZES); + + arg_out_func = (FmgrInfo *) palloc0(fcinfo->nargs * sizeof(FmgrInfo)); + numargs = get_func_arg_info(pl_tuple, &argtypes, &argnames, &argmodes); + + /* + * Iterate through all of the function arguments, printing each input + * value. + */ + for (int i = 0; i < numargs; i++) + { + Oid argtype = pl_struct->proargtypes.values[i]; + char *value; + + type_tuple = SearchSysCache1(TYPEOID, ObjectIdGetDatum(argtype)); + if (!HeapTupleIsValid(type_tuple)) + elog(ERROR, "cache lookup failed for type %u", argtype); + + type_struct = (Form_pg_type) GETSTRUCT(type_tuple); + fmgr_info_cxt(type_struct->typoutput, &(arg_out_func[i]), proc_cxt); + ReleaseSysCache(type_tuple); + + value = OutputFunctionCall(&arg_out_func[i], fcinfo->args[i].value); + ereport(NOTICE, + (errmsg("argument: %d; name: %s; value: %s", + i, argnames[i], value))); + } + + /* + * Get the required information for input conversion of the return value. + * + * If the function uses VOID as result, it is better to return NULL. + * Anyway, let's be honest. This is just a template, so there is not much + * we can do here. This returns NULL except if the result type is text, + * where the result is the source text of the function. 
+ */ + if (pl_struct->prorettype != TEXTOID) + PG_RETURN_NULL(); + + type_tuple = SearchSysCache1(TYPEOID, + ObjectIdGetDatum(pl_struct->prorettype)); + if (!HeapTupleIsValid(type_tuple)) + elog(ERROR, "cache lookup failed for type %u", pl_struct->prorettype); + pg_type_entry = (Form_pg_type) GETSTRUCT(type_tuple); + result_typioparam = getTypeIOParam(type_tuple); + + fmgr_info_cxt(pg_type_entry->typinput, &result_in_func, proc_cxt); + ReleaseSysCache(type_tuple); + + ret = InputFunctionCall(&result_in_func, source, result_typioparam, -1); + PG_RETURN_DATUM(ret); +} diff --git a/src/test/modules/plsample/plsample.control b/src/test/modules/plsample/plsample.control new file mode 100644 index 000000000000..1e67251a1e03 --- /dev/null +++ b/src/test/modules/plsample/plsample.control @@ -0,0 +1,8 @@ +# plsample extension +comment = 'PL/Sample' +default_version = '1.0' +module_pathname = '$libdir/plsample' +relocatable = false +schema = pg_catalog +superuser = false +trusted = true diff --git a/src/test/modules/plsample/sql/plsample.sql b/src/test/modules/plsample/sql/plsample.sql new file mode 100644 index 000000000000..bf0fddac7fc8 --- /dev/null +++ b/src/test/modules/plsample/sql/plsample.sql @@ -0,0 +1,15 @@ +CREATE EXTENSION plsample; +-- Create and test some dummy functions +CREATE FUNCTION plsample_result_text(a1 numeric, a2 text, a3 integer[]) +RETURNS TEXT +AS $$ + Example of source with text result. +$$ LANGUAGE plsample; +SELECT plsample_result_text(1.23, 'abc', '{4, 5, 6}'); + +CREATE FUNCTION plsample_result_void(a1 text[]) +RETURNS VOID +AS $$ + Example of source with void result. +$$ LANGUAGE plsample; +SELECT plsample_result_void('{foo, bar, hoge}'); From 51300b45db95b6fd29f88534ab0739fdc9df1699 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Tue, 18 Aug 2020 12:24:22 +0900 Subject: [PATCH 317/334] Fix use-after-release issue in PL/Sample Introduced in adbe62d0. Per buildfarm member prion, when using RELCACHE_FORCE_RELEASE. 
--- src/test/modules/plsample/plsample.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/test/modules/plsample/plsample.c b/src/test/modules/plsample/plsample.c index 408366906697..80faef506b15 100644 --- a/src/test/modules/plsample/plsample.c +++ b/src/test/modules/plsample/plsample.c @@ -97,6 +97,7 @@ plsample_func_handler(PG_FUNCTION_ARGS) char *proname; Form_pg_type pg_type_entry; Oid result_typioparam; + Oid prorettype; FmgrInfo result_in_func; int numargs; @@ -117,7 +118,6 @@ plsample_func_handler(PG_FUNCTION_ARGS) if (isnull) elog(ERROR, "could not find source text of function \"%s\"", proname); - ReleaseSysCache(pl_tuple); source = DatumGetCString(DirectFunctionCall1(textout, ret)); ereport(NOTICE, (errmsg("source text of function \"%s\": %s", @@ -157,6 +157,10 @@ plsample_func_handler(PG_FUNCTION_ARGS) i, argnames[i], value))); } + /* Type of the result */ + prorettype = pl_struct->prorettype; + ReleaseSysCache(pl_tuple); + /* * Get the required information for input conversion of the return value. * @@ -165,13 +169,13 @@ plsample_func_handler(PG_FUNCTION_ARGS) * we can do here. This returns NULL except if the result type is text, * where the result is the source text of the function. */ - if (pl_struct->prorettype != TEXTOID) + if (prorettype != TEXTOID) PG_RETURN_NULL(); type_tuple = SearchSysCache1(TYPEOID, - ObjectIdGetDatum(pl_struct->prorettype)); + ObjectIdGetDatum(prorettype)); if (!HeapTupleIsValid(type_tuple)) - elog(ERROR, "cache lookup failed for type %u", pl_struct->prorettype); + elog(ERROR, "cache lookup failed for type %u", prorettype); pg_type_entry = (Form_pg_type) GETSTRUCT(type_tuple); result_typioparam = getTypeIOParam(type_tuple); From 623a9ba79bbdd11c5eccb30b8bd5c446130e521c Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Mon, 17 Aug 2020 21:07:10 -0700 Subject: [PATCH 318/334] snapshot scalability: cache snapshots using a xact completion counter. 
Previous commits made it faster/more scalable to compute snapshots. But not building a snapshot is still faster. Now that GetSnapshotData() does not maintain RecentGlobal* anymore, that is actually not too hard: This commit introduces xactCompletionCount, which tracks the number of top-level transactions with xids (i.e. which may have modified the database) that completed in some form since the start of the server. We can avoid rebuilding the snapshot's contents whenever the current xactCompletionCount is the same as it was when the snapshot was originally built. Currently this check happens while holding ProcArrayLock. While it's likely possible to perform the check without acquiring ProcArrayLock, it seems better to do that separately / later, some careful analysis is required. Even with the lock this is a significant win on its own. On a smaller two socket machine this gains another ~1.03x, on a larger machine the effect is roughly double (earlier patch version tested though). If we were able to safely avoid the lock there'd be another significant gain on top of that. 
Author: Andres Freund Reviewed-By: Robert Haas Reviewed-By: Thomas Munro Reviewed-By: David Rowley Discussion: https://postgr.es/m/20200301083601.ews6hz5dduc3w2se@alap3.anarazel.de --- src/backend/replication/logical/snapbuild.c | 1 + src/backend/storage/ipc/procarray.c | 125 ++++++++++++++++---- src/backend/utils/time/snapmgr.c | 4 + src/include/access/transam.h | 9 ++ src/include/utils/snapshot.h | 7 ++ 5 files changed, 126 insertions(+), 20 deletions(-) diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index e9701ea72215..9d5d68f3fa78 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -524,6 +524,7 @@ SnapBuildBuildSnapshot(SnapBuild *builder) snapshot->curcid = FirstCommandId; snapshot->active_count = 0; snapshot->regd_count = 0; + snapshot->snapXactCompletionCount = 0; return snapshot; } diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 96e4a8785760..e687cde6f176 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -407,6 +407,7 @@ CreateSharedProcArray(void) procArray->lastOverflowedXid = InvalidTransactionId; procArray->replication_slot_xmin = InvalidTransactionId; procArray->replication_slot_catalog_xmin = InvalidTransactionId; + ShmemVariableCache->xactCompletionCount = 1; } allProcs = ProcGlobal->allProcs; @@ -534,6 +535,9 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid) /* Advance global latestCompletedXid while holding the lock */ MaintainLatestCompletedXid(latestXid); + /* Same with xactCompletionCount */ + ShmemVariableCache->xactCompletionCount++; + ProcGlobal->xids[proc->pgxactoff] = 0; ProcGlobal->subxidStates[proc->pgxactoff].overflowed = false; ProcGlobal->subxidStates[proc->pgxactoff].count = 0; @@ -667,6 +671,7 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid) { size_t pgxactoff = proc->pgxactoff; + 
Assert(LWLockHeldByMe(ProcArrayLock)); Assert(TransactionIdIsValid(ProcGlobal->xids[pgxactoff])); Assert(ProcGlobal->xids[pgxactoff] == proc->xid); @@ -698,6 +703,9 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid) /* Also advance global latestCompletedXid while holding the lock */ MaintainLatestCompletedXid(latestXid); + + /* Same with xactCompletionCount */ + ShmemVariableCache->xactCompletionCount++; } /* @@ -1916,6 +1924,93 @@ GetMaxSnapshotSubxidCount(void) return TOTAL_MAX_CACHED_SUBXIDS; } +/* + * Initialize old_snapshot_threshold specific parts of a newly build snapshot. + */ +static void +GetSnapshotDataInitOldSnapshot(Snapshot snapshot) +{ + if (!OldSnapshotThresholdActive()) + { + /* + * If not using "snapshot too old" feature, fill related fields with + * dummy values that don't require any locking. + */ + snapshot->lsn = InvalidXLogRecPtr; + snapshot->whenTaken = 0; + } + else + { + /* + * Capture the current time and WAL stream location in case this + * snapshot becomes old enough to need to fall back on the special + * "old snapshot" logic. + */ + snapshot->lsn = GetXLogInsertRecPtr(); + snapshot->whenTaken = GetSnapshotCurrentTimestamp(); + MaintainOldSnapshotTimeMapping(snapshot->whenTaken, snapshot->xmin); + } +} + +/* + * Helper function for GetSnapshotData() that checks if the bulk of the + * visibility information in the snapshot is still valid. If so, it updates + * the fields that need to change and returns true. Otherwise it returns + * false. + * + * This very likely can be evolved to not need ProcArrayLock held (at very + * least in the case we already hold a snapshot), but that's for another day. 
+ */
+static bool
+GetSnapshotDataReuse(Snapshot snapshot)
+{
+	uint64		curXactCompletionCount;
+
+	Assert(LWLockHeldByMe(ProcArrayLock));
+
+	if (unlikely(snapshot->snapXactCompletionCount == 0))
+		return false;
+
+	curXactCompletionCount = ShmemVariableCache->xactCompletionCount;
+	if (curXactCompletionCount != snapshot->snapXactCompletionCount)
+		return false;
+
+	/*
+	 * If the current xactCompletionCount is still the same as it was at the
+	 * time the snapshot was built, we can be sure that rebuilding the
+	 * contents of the snapshot the hard way would result in the same snapshot
+	 * contents:
+	 *
+	 * As explained in transam/README, the set of xids considered running by
+	 * GetSnapshotData() cannot change while ProcArrayLock is held. Snapshot
+	 * contents only depend on transactions with xids and xactCompletionCount
+	 * is incremented whenever a transaction with an xid finishes (while
+	 * holding ProcArrayLock exclusively). Thus the xactCompletionCount check
+	 * ensures we would detect if the snapshot would have changed.
+	 *
+	 * As the snapshot contents are the same as it was before, it is safe
+	 * to re-enter the snapshot's xmin into the PGPROC array. None of the rows
+	 * visible under the snapshot could already have been removed (that'd
+	 * require the set of running transactions to change) and it fulfills the
+	 * requirement that concurrent GetSnapshotData() calls yield the same
+	 * xmin.
+	 */
+	if (!TransactionIdIsValid(MyProc->xmin))
+		MyProc->xmin = TransactionXmin = snapshot->xmin;
+
+	RecentXmin = snapshot->xmin;
+	Assert(TransactionIdPrecedesOrEquals(TransactionXmin, RecentXmin));
+
+	snapshot->curcid = GetCurrentCommandId(false);
+	snapshot->active_count = 0;
+	snapshot->regd_count = 0;
+	snapshot->copied = false;
+
+	GetSnapshotDataInitOldSnapshot(snapshot);
+
+	return true;
+}
+
 /*
  * GetSnapshotData -- returns information about running transactions.
* @@ -1963,6 +2058,7 @@ GetSnapshotData(Snapshot snapshot) TransactionId oldestxid; int mypgxactoff; TransactionId myxid; + uint64 curXactCompletionCount; TransactionId replication_slot_xmin = InvalidTransactionId; TransactionId replication_slot_catalog_xmin = InvalidTransactionId; @@ -2007,12 +2103,19 @@ GetSnapshotData(Snapshot snapshot) */ LWLockAcquire(ProcArrayLock, LW_SHARED); + if (GetSnapshotDataReuse(snapshot)) + { + LWLockRelease(ProcArrayLock); + return snapshot; + } + latest_completed = ShmemVariableCache->latestCompletedXid; mypgxactoff = MyProc->pgxactoff; myxid = other_xids[mypgxactoff]; Assert(myxid == MyProc->xid); oldestxid = ShmemVariableCache->oldestXid; + curXactCompletionCount = ShmemVariableCache->xactCompletionCount; /* xmax is always latestCompletedXid + 1 */ xmax = XidFromFullTransactionId(latest_completed); @@ -2266,6 +2369,7 @@ GetSnapshotData(Snapshot snapshot) snapshot->xcnt = count; snapshot->subxcnt = subcount; snapshot->suboverflowed = suboverflowed; + snapshot->snapXactCompletionCount = curXactCompletionCount; snapshot->curcid = GetCurrentCommandId(false); @@ -2277,26 +2381,7 @@ GetSnapshotData(Snapshot snapshot) snapshot->regd_count = 0; snapshot->copied = false; - if (old_snapshot_threshold < 0) - { - /* - * If not using "snapshot too old" feature, fill related fields with - * dummy values that don't require any locking. - */ - snapshot->lsn = InvalidXLogRecPtr; - snapshot->whenTaken = 0; - } - else - { - /* - * Capture the current time and WAL stream location in case this - * snapshot becomes old enough to need to fall back on the special - * "old snapshot" logic. 
- */ - snapshot->lsn = GetXLogInsertRecPtr(); - snapshot->whenTaken = GetSnapshotCurrentTimestamp(); - MaintainOldSnapshotTimeMapping(snapshot->whenTaken, xmin); - } + GetSnapshotDataInitOldSnapshot(snapshot); return snapshot; } diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index c208538e2e5c..22cf3ebaf472 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -597,6 +597,8 @@ SetTransactionSnapshot(Snapshot sourcesnap, VirtualTransactionId *sourcevxid, CurrentSnapshot->takenDuringRecovery = sourcesnap->takenDuringRecovery; /* NB: curcid should NOT be copied, it's a local matter */ + CurrentSnapshot->snapXactCompletionCount = 0; + /* * Now we have to fix what GetSnapshotData did with MyProc->xmin and * TransactionXmin. There is a race condition: to make sure we are not @@ -672,6 +674,7 @@ CopySnapshot(Snapshot snapshot) newsnap->regd_count = 0; newsnap->active_count = 0; newsnap->copied = true; + newsnap->snapXactCompletionCount = 0; /* setup XID array */ if (snapshot->xcnt > 0) @@ -2209,6 +2212,7 @@ RestoreSnapshot(char *start_address) snapshot->curcid = serialized_snapshot.curcid; snapshot->whenTaken = serialized_snapshot.whenTaken; snapshot->lsn = serialized_snapshot.lsn; + snapshot->snapXactCompletionCount = 0; /* Copy XIDs, if present. */ if (serialized_snapshot.xcnt > 0) diff --git a/src/include/access/transam.h b/src/include/access/transam.h index b32044153b09..2f1f144db4d0 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -231,6 +231,15 @@ typedef struct VariableCacheData FullTransactionId latestCompletedXid; /* newest full XID that has * committed or aborted */ + /* + * Number of top-level transactions with xids (i.e. which may have + * modified the database) that completed in some form since the start of + * the server. This currently is solely used to check whether + * GetSnapshotData() needs to recompute the contents of the snapshot, or + * not. 
There are likely other users of this. Always above 1. + */ + uint64 xactCompletionCount; + /* * These fields are protected by XactTruncationLock */ diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h index 35b1f05bea65..dea072e5edf5 100644 --- a/src/include/utils/snapshot.h +++ b/src/include/utils/snapshot.h @@ -207,6 +207,13 @@ typedef struct SnapshotData TimestampTz whenTaken; /* timestamp when snapshot was taken */ XLogRecPtr lsn; /* position in the WAL stream when taken */ + + /* + * The transaction completion count at the time GetSnapshotData() built + * this snapshot. Allows to avoid re-computing static snapshots when no + * transactions completed since the last GetSnapshotData(). + */ + uint64 snapXactCompletionCount; } SnapshotData; #endif /* SNAPSHOT_H */ From 734478200ababcbb328ec3f02a74047bc470cae2 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 18 Aug 2020 13:13:09 +0300 Subject: [PATCH 319/334] Avoid non-constant format string argument to fprintf(). As Tom Lane pointed out, it could defeat the compiler's printf() format string verification. Backpatch to v12, like that patch that introduced it. Discussion: https://www.postgresql.org/message-id/1069283.1597672779%40sss.pgh.pa.us --- src/bin/pg_basebackup/pg_basebackup.c | 2 +- src/bin/pg_checksums/pg_checksums.c | 2 +- src/bin/pg_rewind/pg_rewind.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c index 8158c8e41957..7a5d4562f946 100644 --- a/src/bin/pg_basebackup/pg_basebackup.c +++ b/src/bin/pg_basebackup/pg_basebackup.c @@ -860,7 +860,7 @@ progress_report(int tablespacenum, const char *filename, * Stay on the same line if reporting to a terminal and we're not done * yet. */ - fprintf(stderr, (!finished && isatty(fileno(stderr))) ? "\r" : "\n"); + fputc((!finished && isatty(fileno(stderr))) ? 
'\r' : '\n', stderr); } static int32 diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c index 0696db69bbd5..ffdc23945c6d 100644 --- a/src/bin/pg_checksums/pg_checksums.c +++ b/src/bin/pg_checksums/pg_checksums.c @@ -166,7 +166,7 @@ progress_report(bool finished) * Stay on the same line if reporting to a terminal and we're not done * yet. */ - fprintf(stderr, (!finished && isatty(fileno(stderr))) ? "\r" : "\n"); + fputc((!finished && isatty(fileno(stderr))) ? '\r' : '\n', stderr); } static bool diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index a9aecc790528..23fc749e4451 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -572,7 +572,7 @@ progress_report(bool finished) * Stay on the same line if reporting to a terminal and we're not done * yet. */ - fprintf(stderr, (!finished && isatty(fileno(stderr))) ? "\r" : "\n"); + fputc((!finished && isatty(fileno(stderr))) ? '\r' : '\n', stderr); } /* From 07f32fcd23ac81898ed47f88beb569c631a2f223 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Tue, 18 Aug 2020 16:31:12 -0700 Subject: [PATCH 320/334] Fix race condition in snapshot caching when 2PC is used. When preparing a transaction xactCompletionCount needs to be incremented, even though the transaction has not committed yet. Otherwise the snapshot used within the transaction otherwise can get reused outside of the prepared transaction. As GetSnapshotData() does not include the current xid when building a snapshot, reuse would not be correct. Somewhat surprisingly the regression tests only rarely show incorrect results without the fix. The reason for that is that often the snapshot's xmax will be >= the backend xid, yielding a snapshot that is correct, despite the bug. I'm working on a reliable test for the bug, but it seems worth seeing whether this fixes all the BF failures while I do. 
Author: Andres Freund
Discussion: https://postgr.es/m/E1k7tGP-0005V0-5k@gemulon.postgresql.org
---
 src/backend/storage/ipc/procarray.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index e687cde6f176..51f8099cad2c 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -860,6 +860,15 @@ ProcArrayClearTransaction(PGPROC *proc)
 	Assert(!(proc->vacuumFlags & PROC_VACUUM_STATE_MASK));
 	Assert(!proc->delayChkpt);
 
+	/*
+	 * Need to increment completion count even though transaction hasn't
+	 * really committed yet. The reason for that is that GetSnapshotData()
+	 * omits the xid of the current transaction, thus without the increment we
+	 * otherwise could end up reusing the snapshot later. Which would be bad,
+	 * because it might not count the prepared transaction as running.
+	 */
+	ShmemVariableCache->xactCompletionCount++;
+
 	/* Clear the subtransaction-XID cache too */
 	Assert(ProcGlobal->subxidStates[pgxactoff].count == proc->subxidStatus.count &&
 		   ProcGlobal->subxidStates[pgxactoff].overflowed == proc->subxidStatus.overflowed);

From 3e98c0bafb28de87ae095b341687dc082371af54 Mon Sep 17 00:00:00 2001
From: Fujii Masao
Date: Wed, 19 Aug 2020 15:34:43 +0900
Subject: [PATCH 321/334] Add pg_backend_memory_contexts system view.

This view displays the usages of all the memory contexts of the server
process attached to the current session. This information is useful to
investigate the cause of backend-local memory bloat.

This information can be also collected by calling
MemoryContextStats(TopMemoryContext) via a debugger. But this technique
cannot be used in some environments because no debugger is available
there. And it outputs lots of text messages and it's not easy to analyze
them. So, pg_backend_memory_contexts view allows us to access to
backend-local memory contexts information more easily.

Bump catalog version.
Author: Atsushi Torikoshi, Fujii Masao Reviewed-by: Tatsuhito Kasahara, Andres Freund, Daniel Gustafsson, Robert Haas, Michael Paquier Discussion: https://postgr.es/m/72a656e0f71d0860161e0b3f67e4d771@oss.nttdata.com --- doc/src/sgml/catalogs.sgml | 122 +++++++++++++++++++++++ src/backend/catalog/system_views.sql | 3 + src/backend/utils/mmgr/mcxt.c | 138 +++++++++++++++++++++++++++ src/include/catalog/catversion.h | 2 +- src/include/catalog/pg_proc.dat | 9 ++ src/test/regress/expected/rules.out | 10 ++ 6 files changed, 283 insertions(+), 1 deletion(-) diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index fc329c5cff96..1232b24e74cf 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -9226,6 +9226,11 @@ SCRAM-SHA-256$<iteration count>:&l available versions of extensions + + pg_backend_memory_contexts + backend memory contexts + + pg_config compile-time configuration parameters @@ -9577,6 +9582,123 @@ SCRAM-SHA-256$<iteration count>:&l + + <structname>pg_backend_memory_contexts</structname> + + + pg_backend_memory_contexts + + + + The view pg_backend_memory_contexts displays all + the memory contexts of the server process attached to the current session. + + + pg_backend_memory_contexts contains one row + for each memory context. + + + + <structname>pg_backend_memory_contexts</structname> Columns + + + + + Column Type + + + Description + + + + + + + + name text + + + Name of the memory context + + + + + + ident text + + + Identification information of the memory context. 
This field is truncated at 1024 bytes + + + + + + parent text + + + Name of the parent of this memory context + + + + + + level int4 + + + Distance from TopMemoryContext in context tree + + + + + + total_bytes int8 + + + Total bytes allocated for this memory context + + + + + + total_nblocks int8 + + + Total number of blocks allocated for this memory context + + + + + + free_bytes int8 + + + Free space in bytes + + + + + + free_chunks int8 + + + Total number of free chunks + + + + + + used_bytes int8 + + + Used space in bytes + + + + +
+ +
+ <structname>pg_config</structname> diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 8625cbeab6e4..ba5a23ac2524 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -554,6 +554,9 @@ CREATE VIEW pg_shmem_allocations AS REVOKE ALL ON pg_shmem_allocations FROM PUBLIC; REVOKE EXECUTE ON FUNCTION pg_get_shmem_allocations() FROM PUBLIC; +CREATE VIEW pg_backend_memory_contexts AS + SELECT * FROM pg_get_backend_memory_contexts(); + -- Statistics views CREATE VIEW pg_stat_all_tables AS diff --git a/src/backend/utils/mmgr/mcxt.c b/src/backend/utils/mmgr/mcxt.c index abda22fa570a..d9bb2499db75 100644 --- a/src/backend/utils/mmgr/mcxt.c +++ b/src/backend/utils/mmgr/mcxt.c @@ -21,8 +21,10 @@ #include "postgres.h" +#include "funcapi.h" #include "mb/pg_wchar.h" #include "miscadmin.h" +#include "utils/builtins.h" #include "utils/memdebug.h" #include "utils/memutils.h" @@ -67,6 +69,12 @@ static void MemoryContextStatsPrint(MemoryContext context, void *passthru, #define AssertNotInCriticalSection(context) \ Assert(CritSectionCount == 0 || (context)->allowInCritSection) +/* ---------- + * The max bytes for showing identifiers of MemoryContext. + * ---------- + */ +#define MEMORY_CONTEXT_IDENT_DISPLAY_SIZE 1024 + /***************************************************************************** * EXPORTED ROUTINES * *****************************************************************************/ @@ -1220,3 +1228,133 @@ pchomp(const char *in) n--; return pnstrdup(in, n); } + +/* + * PutMemoryContextsStatsTupleStore + * One recursion level for pg_get_backend_memory_contexts. 
+ */ +static void +PutMemoryContextsStatsTupleStore(Tuplestorestate *tupstore, + TupleDesc tupdesc, MemoryContext context, + const char *parent, int level) +{ +#define PG_GET_BACKEND_MEMORY_CONTEXTS_COLS 9 + + Datum values[PG_GET_BACKEND_MEMORY_CONTEXTS_COLS]; + bool nulls[PG_GET_BACKEND_MEMORY_CONTEXTS_COLS]; + MemoryContextCounters stat; + MemoryContext child; + const char *name; + const char *ident; + + AssertArg(MemoryContextIsValid(context)); + + name = context->name; + ident = context->ident; + + /* + * To be consistent with logging output, we label dynahash contexts + * with just the hash table name as with MemoryContextStatsPrint(). + */ + if (ident && strcmp(name, "dynahash") == 0) + { + name = ident; + ident = NULL; + } + + /* Examine the context itself */ + memset(&stat, 0, sizeof(stat)); + (*context->methods->stats) (context, NULL, (void *) &level, &stat); + + memset(values, 0, sizeof(values)); + memset(nulls, 0, sizeof(nulls)); + + if (name) + values[0] = CStringGetTextDatum(name); + else + nulls[0] = true; + + if (ident) + { + int idlen = strlen(ident); + char clipped_ident[MEMORY_CONTEXT_IDENT_DISPLAY_SIZE]; + + /* + * Some identifiers such as SQL query string can be very long, + * truncate oversize identifiers. 
+ */ + if (idlen >= MEMORY_CONTEXT_IDENT_DISPLAY_SIZE) + idlen = pg_mbcliplen(ident, idlen, MEMORY_CONTEXT_IDENT_DISPLAY_SIZE - 1); + + memcpy(clipped_ident, ident, idlen); + clipped_ident[idlen] = '\0'; + values[1] = CStringGetTextDatum(clipped_ident); + } + else + nulls[1] = true; + + if (parent) + values[2] = CStringGetTextDatum(parent); + else + nulls[2] = true; + + values[3] = Int32GetDatum(level); + values[4] = Int64GetDatum(stat.totalspace); + values[5] = Int64GetDatum(stat.nblocks); + values[6] = Int64GetDatum(stat.freespace); + values[7] = Int64GetDatum(stat.freechunks); + values[8] = Int64GetDatum(stat.totalspace - stat.freespace); + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + + for (child = context->firstchild; child != NULL; child = child->nextchild) + { + PutMemoryContextsStatsTupleStore(tupstore, tupdesc, + child, name, level + 1); + } +} + +/* + * pg_get_backend_memory_contexts + * SQL SRF showing backend memory context. + */ +Datum +pg_get_backend_memory_contexts(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + TupleDesc tupdesc; + Tuplestorestate *tupstore; + MemoryContext per_query_ctx; + MemoryContext oldcontext; + + /* check to see if caller supports us returning a tuplestore */ + if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"))); + if (!(rsinfo->allowedModes & SFRM_Materialize)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("materialize mode required, but it is not allowed in this context"))); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; + oldcontext = MemoryContextSwitchTo(per_query_ctx); + + tupstore = tuplestore_begin_heap(true, false, 
work_mem); + rsinfo->returnMode = SFRM_Materialize; + rsinfo->setResult = tupstore; + rsinfo->setDesc = tupdesc; + + MemoryContextSwitchTo(oldcontext); + + PutMemoryContextsStatsTupleStore(tupstore, tupdesc, + TopMemoryContext, NULL, 0); + + /* clean up and return the tuplestore */ + tuplestore_donestoring(tupstore); + + return (Datum) 0; +} diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 928495112196..3e6779763000 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 202007251 +#define CATALOG_VERSION_NO 202008191 #endif diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 082a11f2708c..27989971db74 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -7807,6 +7807,15 @@ proargnames => '{name,off,size,allocated_size}', prosrc => 'pg_get_shmem_allocations' }, +# memory context of local backend +{ oid => '2282', descr => 'information about all memory contexts of local backend', + proname => 'pg_get_backend_memory_contexts', prorows => '100', proretset => 't', + provolatile => 'v', proparallel => 'r', prorettype => 'record', proargtypes => '', + proallargtypes => '{text,text,text,int4,int8,int8,int8,int8,int8}', + proargmodes => '{o,o,o,o,o,o,o,o,o}', + proargnames => '{name, ident, parent, level, total_bytes, total_nblocks, free_bytes, free_chunks, used_bytes}', + prosrc => 'pg_get_backend_memory_contexts' }, + # non-persistent series generator { oid => '1066', descr => 'non-persistent series generator', proname => 'generate_series', prorows => '1000', diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 601734a6f1ec..2a18dc423e2b 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1324,6 +1324,16 @@ pg_available_extensions| SELECT e.name, e.comment FROM (pg_available_extensions() e(name, 
default_version, comment) LEFT JOIN pg_extension x ON ((e.name = x.extname))); +pg_backend_memory_contexts| SELECT pg_get_backend_memory_contexts.name, + pg_get_backend_memory_contexts.ident, + pg_get_backend_memory_contexts.parent, + pg_get_backend_memory_contexts.level, + pg_get_backend_memory_contexts.total_bytes, + pg_get_backend_memory_contexts.total_nblocks, + pg_get_backend_memory_contexts.free_bytes, + pg_get_backend_memory_contexts.free_chunks, + pg_get_backend_memory_contexts.used_bytes + FROM pg_get_backend_memory_contexts() pg_get_backend_memory_contexts(name, ident, parent, level, total_bytes, total_nblocks, free_bytes, free_chunks, used_bytes); pg_config| SELECT pg_config.name, pg_config.setting FROM pg_config() pg_config(name, setting); From 20729324078055a4d9654fc5af9570fe625786a5 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 19 Aug 2020 14:07:49 -0400 Subject: [PATCH 322/334] Suppress unnecessary RelabelType nodes in yet more cases. Commit a477bfc1d fixed eval_const_expressions() to ensure that it didn't generate unnecessary RelabelType nodes, but I failed to notice that some other places in the planner had the same issue. Really noplace in the planner should be using plain makeRelabelType(), for fear of generating expressions that should be equal() to semantically equivalent trees, but aren't. An example is that because canonicalize_ec_expression() failed to be careful about this, we could end up with an equivalence class containing both a plain Const, and a Const-with-RelabelType representing exactly the same value. So far as I can tell this led to no visible misbehavior, but we did waste a bunch of cycles generating and evaluating "Const = Const-with-RelabelType" to prove such entries are redundant. Hence, move the support function added by a477bfc1d to where it can be more generally useful, and use it in the places where planner code previously used makeRelabelType. Back-patch to v12, like the previous patch. 
While I have no concrete evidence of any real misbehavior here, it's certainly possible that I overlooked a case where equivalent expressions that aren't equal() could cause a user-visible problem. In any case carrying extra RelabelType nodes through planning to execution isn't very desirable. Discussion: https://postgr.es/m/1311836.1597781384@sss.pgh.pa.us --- src/backend/nodes/nodeFuncs.c | 75 ++++++++++++++++---- src/backend/optimizer/path/equivclass.c | 43 +++++------ src/backend/optimizer/prep/prepunion.c | 10 +-- src/backend/optimizer/util/clauses.c | 94 ++++++------------------- src/include/nodes/nodeFuncs.h | 3 + 5 files changed, 106 insertions(+), 119 deletions(-) diff --git a/src/backend/nodes/nodeFuncs.c b/src/backend/nodes/nodeFuncs.c index d85ca9f7c501..9ce8f43385ec 100644 --- a/src/backend/nodes/nodeFuncs.c +++ b/src/backend/nodes/nodeFuncs.c @@ -575,27 +575,76 @@ exprIsLengthCoercion(const Node *expr, int32 *coercedTypmod) return false; } +/* + * applyRelabelType + * Add a RelabelType node if needed to make the expression expose + * the specified type, typmod, and collation. + * + * This is primarily intended to be used during planning. Therefore, it must + * maintain the post-eval_const_expressions invariants that there are not + * adjacent RelabelTypes, and that the tree is fully const-folded (hence, + * we mustn't return a RelabelType atop a Const). If we do find a Const, + * we'll modify it in-place if "overwrite_ok" is true; that should only be + * passed as true if caller knows the Const is newly generated. + */ +Node * +applyRelabelType(Node *arg, Oid rtype, int32 rtypmod, Oid rcollid, + CoercionForm rformat, int rlocation, bool overwrite_ok) +{ + /* + * If we find stacked RelabelTypes (eg, from foo::int::oid) we can discard + * all but the top one, and must do so to ensure that semantically + * equivalent expressions are equal(). 
+ */ + while (arg && IsA(arg, RelabelType)) + arg = (Node *) ((RelabelType *) arg)->arg; + + if (arg && IsA(arg, Const)) + { + /* Modify the Const directly to preserve const-flatness. */ + Const *con = (Const *) arg; + + if (!overwrite_ok) + con = copyObject(con); + con->consttype = rtype; + con->consttypmod = rtypmod; + con->constcollid = rcollid; + /* We keep the Const's original location. */ + return (Node *) con; + } + else if (exprType(arg) == rtype && + exprTypmod(arg) == rtypmod && + exprCollation(arg) == rcollid) + { + /* Sometimes we find a nest of relabels that net out to nothing. */ + return arg; + } + else + { + /* Nope, gotta have a RelabelType. */ + RelabelType *newrelabel = makeNode(RelabelType); + + newrelabel->arg = (Expr *) arg; + newrelabel->resulttype = rtype; + newrelabel->resulttypmod = rtypmod; + newrelabel->resultcollid = rcollid; + newrelabel->relabelformat = rformat; + newrelabel->location = rlocation; + return (Node *) newrelabel; + } +} + /* * relabel_to_typmod * Add a RelabelType node that changes just the typmod of the expression. * - * This is primarily intended to be used during planning. Therefore, it - * strips any existing RelabelType nodes to maintain the planner's invariant - * that there are not adjacent RelabelTypes. + * Convenience function for a common usage of applyRelabelType. 
*/ Node * relabel_to_typmod(Node *expr, int32 typmod) { - Oid type = exprType(expr); - Oid coll = exprCollation(expr); - - /* Strip any existing RelabelType node(s) */ - while (expr && IsA(expr, RelabelType)) - expr = (Node *) ((RelabelType *) expr)->arg; - - /* Apply new typmod, preserving the previous exposed type and collation */ - return (Node *) makeRelabelType((Expr *) expr, type, typmod, coll, - COERCE_EXPLICIT_CAST); + return applyRelabelType(expr, exprType(expr), typmod, exprCollation(expr), + COERCE_EXPLICIT_CAST, -1, false); } /* diff --git a/src/backend/optimizer/path/equivclass.c b/src/backend/optimizer/path/equivclass.c index b99cec00cb7a..b68a5a0ec717 100644 --- a/src/backend/optimizer/path/equivclass.c +++ b/src/backend/optimizer/path/equivclass.c @@ -490,10 +490,6 @@ process_equivalence(PlannerInfo *root, * work to not label the collation at all in EC members, but this is risky * since some parts of the system expect exprCollation() to deliver the * right answer for a sort key.) - * - * Note this code assumes that the expression has already been through - * eval_const_expressions, so there are no CollateExprs and no redundant - * RelabelTypes. */ Expr * canonicalize_ec_expression(Expr *expr, Oid req_type, Oid req_collation) @@ -514,29 +510,24 @@ canonicalize_ec_expression(Expr *expr, Oid req_type, Oid req_collation) exprCollation((Node *) expr) != req_collation) { /* - * Strip any existing RelabelType, then add a new one if needed. This - * is to preserve the invariant of no redundant RelabelTypes. - * - * If we have to change the exposed type of the stripped expression, - * set typmod to -1 (since the new type may not have the same typmod - * interpretation). If we only have to change collation, preserve the - * exposed typmod. + * If we have to change the type of the expression, set typmod to -1, + * since the new type may not have the same typmod interpretation. + * When we only have to change collation, preserve the exposed typmod. 
+ */ + int32 req_typmod; + + if (expr_type != req_type) + req_typmod = -1; + else + req_typmod = exprTypmod((Node *) expr); + + /* + * Use applyRelabelType so that we preserve const-flatness. This is + * important since eval_const_expressions has already been applied. */ - while (expr && IsA(expr, RelabelType)) - expr = (Expr *) ((RelabelType *) expr)->arg; - - if (exprType((Node *) expr) != req_type) - expr = (Expr *) makeRelabelType(expr, - req_type, - -1, - req_collation, - COERCE_IMPLICIT_CAST); - else if (exprCollation((Node *) expr) != req_collation) - expr = (Expr *) makeRelabelType(expr, - req_type, - exprTypmod((Node *) expr), - req_collation, - COERCE_IMPLICIT_CAST); + expr = (Expr *) applyRelabelType((Node *) expr, + req_type, req_typmod, req_collation, + COERCE_IMPLICIT_CAST, -1, false); } return expr; diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index 2ebd4ea33207..745f443e5c2d 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -1200,13 +1200,9 @@ generate_setop_tlist(List *colTypes, List *colCollations, * will reach the executor without any further processing. 
*/ if (exprCollation(expr) != colColl) - { - expr = (Node *) makeRelabelType((Expr *) expr, - exprType(expr), - exprTypmod(expr), - colColl, - COERCE_IMPLICIT_CAST); - } + expr = applyRelabelType(expr, + exprType(expr), exprTypmod(expr), colColl, + COERCE_IMPLICIT_CAST, -1, false); tle = makeTargetEntry((Expr *) expr, (AttrNumber) resno++, diff --git a/src/backend/optimizer/util/clauses.c b/src/backend/optimizer/util/clauses.c index 7105d0a2db9a..750586fceb74 100644 --- a/src/backend/optimizer/util/clauses.c +++ b/src/backend/optimizer/util/clauses.c @@ -120,9 +120,6 @@ static Node *eval_const_expressions_mutator(Node *node, static bool contain_non_const_walker(Node *node, void *context); static bool ece_function_is_safe(Oid funcid, eval_const_expressions_context *context); -static Node *apply_const_relabel(Node *arg, Oid rtype, - int32 rtypmod, Oid rcollid, - CoercionForm rformat, int rlocation); static List *simplify_or_arguments(List *args, eval_const_expressions_context *context, bool *haveNull, bool *forceTrue); @@ -2819,12 +2816,13 @@ eval_const_expressions_mutator(Node *node, arg = eval_const_expressions_mutator((Node *) relabel->arg, context); /* ... and attach a new RelabelType node, if needed */ - return apply_const_relabel(arg, - relabel->resulttype, - relabel->resulttypmod, - relabel->resultcollid, - relabel->relabelformat, - relabel->location); + return applyRelabelType(arg, + relabel->resulttype, + relabel->resulttypmod, + relabel->resultcollid, + relabel->relabelformat, + relabel->location, + true); } case T_CoerceViaIO: { @@ -2971,12 +2969,13 @@ eval_const_expressions_mutator(Node *node, arg = eval_const_expressions_mutator((Node *) collate->arg, context); /* ... 
and attach a new RelabelType node, if needed */ - return apply_const_relabel(arg, - exprType(arg), - exprTypmod(arg), - collate->collOid, - COERCE_IMPLICIT_CAST, - collate->location); + return applyRelabelType(arg, + exprType(arg), + exprTypmod(arg), + collate->collOid, + COERCE_IMPLICIT_CAST, + collate->location, + true); } case T_CaseExpr: { @@ -3478,12 +3477,13 @@ eval_const_expressions_mutator(Node *node, cdomain->resulttype); /* Generate RelabelType to substitute for CoerceToDomain */ - return apply_const_relabel(arg, - cdomain->resulttype, - cdomain->resulttypmod, - cdomain->resultcollid, - cdomain->coercionformat, - cdomain->location); + return applyRelabelType(arg, + cdomain->resulttype, + cdomain->resulttypmod, + cdomain->resultcollid, + cdomain->coercionformat, + cdomain->location, + true); } newcdomain = makeNode(CoerceToDomain); @@ -3616,58 +3616,6 @@ ece_function_is_safe(Oid funcid, eval_const_expressions_context *context) return false; } -/* - * Subroutine for eval_const_expressions: apply RelabelType if needed - */ -static Node * -apply_const_relabel(Node *arg, Oid rtype, int32 rtypmod, Oid rcollid, - CoercionForm rformat, int rlocation) -{ - /* - * If we find stacked RelabelTypes (eg, from foo::int::oid) we can discard - * all but the top one, and must do so to ensure that semantically - * equivalent expressions are equal(). - */ - while (arg && IsA(arg, RelabelType)) - arg = (Node *) ((RelabelType *) arg)->arg; - - if (arg && IsA(arg, Const)) - { - /* - * If it's a Const, just modify it in-place; since this is part of - * eval_const_expressions, we want to end up with a simple Const not - * an expression tree. We assume the Const is newly generated and - * hence safe to modify. 
- */ - Const *con = (Const *) arg; - - con->consttype = rtype; - con->consttypmod = rtypmod; - con->constcollid = rcollid; - return (Node *) con; - } - else if (exprType(arg) == rtype && - exprTypmod(arg) == rtypmod && - exprCollation(arg) == rcollid) - { - /* Sometimes we find a nest of relabels that net out to nothing. */ - return arg; - } - else - { - /* Nope, gotta have a RelabelType. */ - RelabelType *newrelabel = makeNode(RelabelType); - - newrelabel->arg = (Expr *) arg; - newrelabel->resulttype = rtype; - newrelabel->resulttypmod = rtypmod; - newrelabel->resultcollid = rcollid; - newrelabel->relabelformat = rformat; - newrelabel->location = rlocation; - return (Node *) newrelabel; - } -} - /* * Subroutine for eval_const_expressions: process arguments of an OR clause * diff --git a/src/include/nodes/nodeFuncs.h b/src/include/nodes/nodeFuncs.h index 779906b9b77f..9cc56eecaa3a 100644 --- a/src/include/nodes/nodeFuncs.h +++ b/src/include/nodes/nodeFuncs.h @@ -36,6 +36,9 @@ typedef bool (*check_function_callback) (Oid func_id, void *context); extern Oid exprType(const Node *expr); extern int32 exprTypmod(const Node *expr); extern bool exprIsLengthCoercion(const Node *expr, int32 *coercedTypmod); +extern Node *applyRelabelType(Node *arg, Oid rtype, int32 rtypmod, Oid rcollid, + CoercionForm rformat, int rlocation, + bool overwrite_ok); extern Node *relabel_to_typmod(Node *expr, int32 typmod); extern Node *strip_implicit_coercions(Node *node); extern bool expression_returns_set(Node *clause); From 1fe1f42e3e85279e1cb8b004b3b076a04bde4cee Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Wed, 19 Aug 2020 18:19:52 -0700 Subject: [PATCH 323/334] Acquire ProcArrayLock exclusively in ProcArrayClearTransaction. This corrects an oversight by me in 20729324078, which made ProcArrayClearTransaction() increment xactCompletionCount. That requires an exclusive lock, obviously. 
There's other approaches that avoid the exclusive acquisition, but given that a 2PC commit is fairly heavyweight, it doesn't seem worth doing so. I've not been able to measure a performance difference, unsurprisingly. I did add a comment documenting that we could do so, should it ever become a bottleneck. Reported-By: Tom Lane Author: Andres Freund Discussion: https://postgr.es/m/1355915.1597794204@sss.pgh.pa.us --- src/backend/storage/ipc/procarray.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 51f8099cad2c..60b7a5db8e07 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -840,13 +840,20 @@ ProcArrayClearTransaction(PGPROC *proc) size_t pgxactoff; /* - * We can skip locking ProcArrayLock exclusively here, because this action - * does not actually change anyone's view of the set of running XIDs: our - * entry is duplicate with the gxact that has already been inserted into - * the ProcArray. But need it in shared mode for pgproc->pgxactoff to stay - * the same. + * Currently we need to lock ProcArrayLock exclusively here, as we + * increment xactCompletionCount below. We also need it at least in shared + * mode for pgproc->pgxactoff to stay the same below. + * + * We could however, as this action does not actually change anyone's view + * of the set of running XIDs (our entry is duplicate with the gxact that + * has already been inserted into the ProcArray), lower the lock level to + * shared if we were to make xactCompletionCount an atomic variable. But + * that doesn't seem worth it currently, as a 2PC commit is heavyweight + * enough for this not to be the bottleneck. 
If it ever becomes a + * bottleneck it may also be worth considering to combine this with the + * subsequent ProcArrayRemove() */ - LWLockAcquire(ProcArrayLock, LW_SHARED); + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); pgxactoff = proc->pgxactoff; From 0784c333728dd454b80c0bd0faec916782370810 Mon Sep 17 00:00:00 2001 From: Alvaro Herrera Date: Thu, 20 Aug 2020 13:49:04 -0400 Subject: [PATCH 324/334] Revise REINDEX CONCURRENTLY recovery instructions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the leftover invalid index is "ccold", there's no need to re-run the command. Reword the instructions to make that explicit. Backpatch to 12, where REINDEX CONCURRENTLY appeared. Author: Álvaro Herrera Reviewed-by: Michael Paquier Reviewed-by: Julien Rouhaud Discussion: https://postgr.es/m/20200819211312.GA15497@alvherre.pgsql --- doc/src/sgml/ref/reindex.sgml | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/doc/src/sgml/ref/reindex.sgml b/doc/src/sgml/ref/reindex.sgml index aac5d5be23f4..c16f223e4edb 100644 --- a/doc/src/sgml/ref/reindex.sgml +++ b/doc/src/sgml/ref/reindex.sgml @@ -307,7 +307,7 @@ REINDEX [ ( option [, ...] ) ] { IN - A new temporary index definition is added to the catalog + A new transient index definition is added to the catalog pg_index. This definition will be used to replace the old index. A SHARE UPDATE EXCLUSIVE lock at session level is taken on the indexes being reindexed as well as their @@ -383,13 +383,15 @@ Indexes: "idx_ccnew" btree (col) INVALID - The recommended recovery method in such cases is to drop the invalid index - and try again to perform REINDEX CONCURRENTLY. The - concurrent index created during the processing has a name ending in the - suffix ccnew, or ccold if it is an - old index definition which we failed to drop. Invalid indexes can be - dropped using DROP INDEX, including invalid toast - indexes. 
+ If the index marked INVALID is suffixed + ccnew, then it corresponds to the transient + index created during the concurrent operation, and the recommended + recovery method is to drop it using DROP INDEX, + then attempt REINDEX CONCURRENTLY again. + If the invalid index is instead suffixed ccold, + it corresponds to the original index which could not be dropped; + the recommended recovery method is to just drop said index, since the + rebuild proper has been successful. From c62a0a49f33a0d45a97aa1d3a5bc6ddc83f10d82 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Thu, 20 Aug 2020 12:59:00 -0700 Subject: [PATCH 325/334] Revert "Make vacuum a bit more verbose to debug BF failure." This reverts commit 49967da65aec970fcda123acc681f1df5d70bfc6. Enough time has passed that we can be confident that 07f32fcd23a resolved the issue. Therefore we can remove the temporary debugging aids. Author: Andres Freund Discussion: https://postgr.es/m/E1k7tGP-0005V0-5k@gemulon.postgresql.org --- src/backend/access/heap/heapam.c | 11 +---------- src/backend/access/heap/vacuumlazy.c | 7 ------- 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 8eb276e46449..9b5f417eac44 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -6048,16 +6048,7 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask, TransactionIdIsInProgress(members[i].xid)) { /* running locker cannot possibly be older than the cutoff */ - if (TransactionIdPrecedes(members[i].xid, cutoff_xid)) - { - /* temporary on-bf debugging */ - elog(PANIC, "too old alive locker: multi: %u, member xid: %u, memb-current: %d, memb-progress: %d, cutoff: %u, cutoff-multi: %u, relfrozenxid: %u, relminmxid: %u", - multi, members[i].xid, - TransactionIdIsCurrentTransactionId(members[i].xid), - TransactionIdIsInProgress(members[i].xid), - cutoff_xid, cutoff_multi, - relfrozenxid, relminmxid); - } + 
Assert(!TransactionIdPrecedes(members[i].xid, cutoff_xid)); newmembers[nnewmembers++] = members[i]; has_lockers = true; } diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 03c8e1ff7ea9..44e2224dd557 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -1350,14 +1350,7 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats, if (HeapTupleIsHotUpdated(&tuple) || HeapTupleIsHeapOnly(&tuple) || params->index_cleanup == VACOPT_TERNARY_DISABLED) - { - /* temporary on-bf debugging */ - elog(LOG, "treating dead HOT tuple (updated %d, heap only: %d, index cleanup: %d) as alive", - HeapTupleIsHotUpdated(&tuple), HeapTupleIsHeapOnly(&tuple), - params->index_cleanup == VACOPT_TERNARY_DISABLED); - nkeep += 1; - } else tupgone = true; /* we can delete the tuple */ all_visible = false; From 8431d33079a2c552aaa223ebcfd470572d90146b Mon Sep 17 00:00:00 2001 From: David Rowley Date: Fri, 21 Aug 2020 09:33:56 +1200 Subject: [PATCH 326/334] Fix a few typos in JIT comments and README Reviewed-by: Abhijit Menon-Sen Reviewed-by: Andres Freund Discussion: https://postgr.es/m/CAApHDvobgmCs6CohqhKTUf7D8vffoZXQTCBTERo9gbOeZmvLTw%40mail.gmail.com Backpatch-through: 11, where JIT was added --- src/backend/jit/README | 14 +++++++------- src/include/jit/llvmjit_emit.h | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/backend/jit/README b/src/backend/jit/README index e2fac8558e8e..5427bdf2153f 100644 --- a/src/backend/jit/README +++ b/src/backend/jit/README @@ -10,11 +10,11 @@ SQL expressions to evaluate an SQL predicate like WHERE a.col = 3, it is possible to generate a function than can be natively executed by the CPU that just handles that expression, yielding a speedup. -That this is done at query execution time, possibly even only in cases -where the relevant task is done a number of times, makes it JIT, -rather than ahead-of-time (AOT). 
Given the way JIT compilation is used -in PostgreSQL, the lines between interpretation, AOT and JIT are -somewhat blurry. +This is JIT, rather than ahead-of-time (AOT) compilation, because it +is done at query execution time, and perhaps only in cases where the +relevant task is repeated a number of times. Given the way JIT +compilation is used in PostgreSQL, the lines between interpretation, +AOT and JIT are somewhat blurry. Note that the interpreted program turned into a native program does not necessarily have to be a program in the classical sense. E.g. it @@ -99,7 +99,7 @@ Lifetimes of JITed functions are managed via JITContext. Exactly one such context should be created for work in which all created JITed function should have the same lifetime. E.g. there's exactly one JITContext for each query executed, in the query's EState. Only the -release of an JITContext is exposed to the provider independent +release of a JITContext is exposed to the provider independent facility, as the creation of one is done on-demand by the JIT implementations. @@ -231,7 +231,7 @@ needs to be referenced as an offset to one block of memory stored in an ExprState, rather than absolute pointers into memory. Once that is addressed, adding an LRU cache that's keyed by the -generated LLVM IR will allow to use optimized functions even for +generated LLVM IR will allow the usage of optimized functions even for faster queries. A longer term project is to move expression compilation to the planner diff --git a/src/include/jit/llvmjit_emit.h b/src/include/jit/llvmjit_emit.h index 1a7d6db7259e..3142df608b3c 100644 --- a/src/include/jit/llvmjit_emit.h +++ b/src/include/jit/llvmjit_emit.h @@ -1,6 +1,6 @@ /* * llvmjit_emit.h - * Helpers to make emitting LLVM IR a it more concise and pgindent proof. + * Helpers to make emitting LLVM IR a bit more concise and pgindent proof. 
* * Copyright (c) 2018-2020, PostgreSQL Global Development Group * From d259afa7365165760004c2fdbe2520a94ddf2600 Mon Sep 17 00:00:00 2001 From: Fujii Masao Date: Fri, 21 Aug 2020 12:33:30 +0900 Subject: [PATCH 327/334] Fix typos in comments. Author: Masahiko Sawada Reviewed-by: Fujii Masao Discussion: https://postgr.es/m/CA+fd4k4m9hFSrRLB3etPWO5_v5=MujVZWRtz63q+55hM0Dz25Q@mail.gmail.com --- src/backend/storage/ipc/procarray.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 60b7a5db8e07..45eab7e5a622 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -198,7 +198,7 @@ typedef struct ComputeXidHorizonsResult * be removed. * * This likely should only be needed to determine whether pg_subtrans can - * be truncated. It currently includes the effects of replications slots, + * be truncated. It currently includes the effects of replication slots, * for historical reasons. But that could likely be changed. */ TransactionId oldest_considered_running; @@ -207,7 +207,7 @@ typedef struct ComputeXidHorizonsResult * Oldest xid for which deleted tuples need to be retained in shared * tables. * - * This includes the effects of replications lots. If that's not desired, + * This includes the effects of replication slots. If that's not desired, * look at shared_oldest_nonremovable_raw; */ TransactionId shared_oldest_nonremovable; From 4f78b6a46503c8b28f38ec40503149d78ba47716 Mon Sep 17 00:00:00 2001 From: Viktor Kurilko Date: Mon, 4 May 2026 21:52:00 +0700 Subject: [PATCH 328/334] Resolve conflicts in the catversion.h (#2443) Commit 3e98c0bafb28de87ae095b341687dc082371af54 updates the catalog version, but we use a different format. 
--- src/include/catalog/catversion.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 972fd7982228..add1503e3859 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -55,12 +55,7 @@ * catalog versions from Greenplum. */ -<<<<<<< HEAD /* 3yyymmddN */ -#define CATALOG_VERSION_NO 302604171 -======= -/* yyyymmddN */ -#define CATALOG_VERSION_NO 202008191 ->>>>>>> d259afa7365165760004c2fdbe2520a94ddf2600 +#define CATALOG_VERSION_NO 302605041 #endif From 87d9e1568d497e466f2c7b3608d6e4695e2c44e7 Mon Sep 17 00:00:00 2001 From: Viktor Kurilko Date: Wed, 6 May 2026 10:56:47 +0700 Subject: [PATCH 329/334] Resolve conflicts in the snapshot.h (#2447) Commit 623a9ba79bbdd11c5eccb30b8bd5c446130e521c added a new field to the SnapshotData struct, but earlier commit 3b4cd7887fd16542339ae9cb13df252d7c58fc11 added a field nearby. --- src/include/utils/snapshot.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h index 93b7114dcae2..7e2772ab0a6b 100644 --- a/src/include/utils/snapshot.h +++ b/src/include/utils/snapshot.h @@ -212,18 +212,17 @@ typedef struct SnapshotData XLogRecPtr lsn; /* position in the WAL stream when taken */ /* -<<<<<<< HEAD * GP: Global information about which transactions are visible for a * distributed transaction, with cached local xids */ DistributedSnapshotWithLocalMapping distribSnapshotWithLocalMapping; -======= + + /* * The transaction completion count at the time GetSnapshotData() built * this snapshot. Allows to avoid re-computing static snapshots when no * transactions completed since the last GetSnapshotData(). 
*/ uint64 snapXactCompletionCount; ->>>>>>> d259afa7365165760004c2fdbe2520a94ddf2600 } SnapshotData; #endif /* SNAPSHOT_H */ From 4c3633beb77cd7d404186910c4be219590bcd2c0 Mon Sep 17 00:00:00 2001 From: Viktor Kurilko Date: Wed, 6 May 2026 11:27:00 +0700 Subject: [PATCH 330/334] Resolve conflicts src/backend/storage/lmgr/lwlocknames.txt (#2442) Commit 566372b3d6435639e4cc4476d79b8505a0297c87 added new locks to the lwlocknames.txt, while earlier commit 19cd1cf4b68faff2e29bc2fa884c480e4644cdb4 added gpdb specific locks to the same place. --- src/backend/storage/lmgr/lwlocknames.txt | 36 +++++++++++------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index 8289c6993b7c..77fdf74695f7 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -50,26 +50,22 @@ MultiXactTruncationLock 41 OldSnapshotTimeMapLock 42 LogicalRepWorkerLock 43 XactTruncationLock 44 -<<<<<<< HEAD - -# Additional individual locks in GPDB -SharedSnapshotLock 45 -DistributedLogControlLock 46 -# 47 is available; was formerly AOSegFileLock -ResQueueLock 48 -ResGroupLock 49 -ErrorLogLock 50 -SessionStateLock 51 -RelfilenodeGenLock 52 -WorkFileManagerLock 53 -DistributedLogTruncateLock 54 -TwophaseCommitLock 55 -ShareInputScanLock 56 -FTSReplicationStatusLock 57 -GxidBumpLock 58 -ParallelCursorEndpointLock 59 -======= # 45 was XactTruncationLock until removal of BackendRandomLock WrapLimitsVacuumLock 46 NotifyQueueTailLock 47 ->>>>>>> d259afa7365165760004c2fdbe2520a94ddf2600 + +# Additional individual locks in GPDB +SharedSnapshotLock 48 +DistributedLogControlLock 49 +ResQueueLock 50 +ResGroupLock 51 +ErrorLogLock 52 +SessionStateLock 53 +RelfilenodeGenLock 54 +WorkFileManagerLock 55 +DistributedLogTruncateLock 56 +TwophaseCommitLock 57 +ShareInputScanLock 58 +FTSReplicationStatusLock 59 +GxidBumpLock 60 +ParallelCursorEndpointLock 61 From 
bae42457e5125ed93bb1e181bcf2bf2a80b14fb4 Mon Sep 17 00:00:00 2001 From: Maxim Michkov Date: Wed, 6 May 2026 10:53:55 +0300 Subject: [PATCH 331/334] Resolve conflicts in src/backend/commands/tablecmds.c (#2460) Commit e3931d01f3afef14703827eda1dad0a3fb3b5d07 added new local variables while commit 07c2f0112adc8c682fcf330b6beef0197d210834 added another variable nearby. --- src/backend/commands/tablecmds.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index b117cf3f0b85..de6f9993d85d 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -7579,12 +7579,9 @@ ATExecAddColumn(List **wqueue, AlteredTableInfo *tab, Relation rel, AlterTableCmd *childcmd; AclResult aclresult; ObjectAddress address; -<<<<<<< HEAD List* enc; -======= TupleDesc tupdesc; FormData_pg_attribute *aattr[] = {&attribute}; ->>>>>>> d259afa7365165760004c2fdbe2520a94ddf2600 /* At top level, permission check was done in ATPrepCmd, else do it */ if (recursing) From a732b1e754b3bdbb74b230de5516337cdb9f9802 Mon Sep 17 00:00:00 2001 From: Maxim Michkov Date: Wed, 6 May 2026 10:54:15 +0300 Subject: [PATCH 332/334] Resolve conflicts in src/backend/commands/opclasscmds.c (#2459) Commit 9f9682783bea74bf8d93cac4f7dd65fa677f5dc7 removed header opfam_internal.h, however there was GPDB-specific include nearby. 
--- src/backend/commands/opclasscmds.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/backend/commands/opclasscmds.c b/src/backend/commands/opclasscmds.c index 5c92cad7d7cf..97e4a0fbe7e7 100644 --- a/src/backend/commands/opclasscmds.c +++ b/src/backend/commands/opclasscmds.c @@ -27,11 +27,6 @@ #include "catalog/dependency.h" #include "catalog/indexing.h" #include "catalog/objectaccess.h" -<<<<<<< HEAD -#include "catalog/oid_dispatch.h" -#include "catalog/opfam_internal.h" -======= ->>>>>>> d259afa7365165760004c2fdbe2520a94ddf2600 #include "catalog/pg_am.h" #include "catalog/pg_amop.h" #include "catalog/pg_amproc.h" @@ -54,6 +49,7 @@ #include "utils/rel.h" #include "utils/syscache.h" +#include "catalog/oid_dispatch.h" #include "cdb/cdbvars.h" #include "cdb/cdbdisp_query.h" From 33bce61d133eba8edb7fdafbdd47c28015a722ee Mon Sep 17 00:00:00 2001 From: Georgy Shelkovy Date: Wed, 6 May 2026 13:30:27 +0500 Subject: [PATCH 333/334] Resolve conflicts in src/include/access/nbtree.h (#2448) Commit 9f96827 added a definition of the new function btadjustmembers to src/include/access/nbtree.h, while earlier commit 38d8815 had already added a definition of the function btree_or_bitmap_validate to the same location. 
--- src/include/access/nbtree.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 22df078571c0..ec6ba6072fe0 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1141,14 +1141,11 @@ extern bool _bt_allequalimage(Relation rel, bool debugmessage); * prototypes for functions in nbtvalidate.c */ extern bool btvalidate(Oid opclassoid); -<<<<<<< HEAD extern bool btree_or_bitmap_validate(Oid opclassoid, const char *amname); -======= extern void btadjustmembers(Oid opfamilyoid, Oid opclassoid, List *operators, List *functions); ->>>>>>> d259afa7365165760004c2fdbe2520a94ddf2600 /* * prototypes for functions in nbtsort.c From 02a1c00366db8589ce76c88cbc0a5bb536f487db Mon Sep 17 00:00:00 2001 From: Georgy Shelkovy Date: Wed, 6 May 2026 13:31:07 +0500 Subject: [PATCH 334/334] Resolve conflicts in configure.ac (#2440) Commit 25244b8 renamed configure.in to configure.ac, although earlier GPDB-specific commits had already added the dnl prefix in the same location. --- configure.ac | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/configure.ac b/configure.ac index 5102af0ae73d..c545328648ac 100644 --- a/configure.ac +++ b/configure.ac @@ -24,17 +24,10 @@ AC_INIT([Greenplum Database], [8.0.0-alpha.0], [support@greenplum.org], [], [htt [PG_PACKAGE_VERSION=14alpha0] AC_SUBST(PG_PACKAGE_VERSION) -<<<<<<< HEAD:configure.in dnl m4_if(m4_defn([m4_PACKAGE_VERSION]), [2.69], [], [m4_fatal([Autoconf version 2.69 is required. dnl Untested combinations of 'autoconf' and PostgreSQL versions are not -dnl recommended. You can remove the check from 'configure.in' but it is then +dnl recommended. You can remove the check from 'configure.ac' but it is then dnl your responsibility whether the result works or not.])]) -======= -m4_if(m4_defn([m4_PACKAGE_VERSION]), [2.69], [], [m4_fatal([Autoconf version 2.69 is required. 
-Untested combinations of 'autoconf' and PostgreSQL versions are not -recommended. You can remove the check from 'configure.ac' but it is then -your responsibility whether the result works or not.])]) ->>>>>>> d259afa7365165760004c2fdbe2520a94ddf2600:configure.ac AC_COPYRIGHT([Copyright (c) 1996-2020, PostgreSQL Global Development Group]) AC_CONFIG_SRCDIR([src/backend/access/common/heaptuple.c]) AC_CONFIG_AUX_DIR(config)