From 235f733de515448fa3feb8b04e7aad4ba0c5f94c Mon Sep 17 00:00:00 2001
From: Lawrence Jones <lawrjone@gmail.com>
Date: Tue, 27 Aug 2019 14:17:54 +0100
Subject: [PATCH] Hardcode pg_rewind retrying (up-to 5m)

1. https://github.com/gocardless/stolon/pull/14

Retry pg_rewind for up to 5m after the first attempt. This gives the
followed database time to finish booting before we fall back to the
more expensive pg_basebackup.

This comes as a crappier alternative to [1], where we were creating
proper configuration values. We want to try this before we attempt to
create a more mature setup, as we intend to leave that work for whenever
we upstream our cascading replication.
---
 cmd/keeper/cmd/keeper.go | 46 ++++++++++++++++++++++++++++------------
 1 file changed, 33 insertions(+), 13 deletions(-)

diff --git a/cmd/keeper/cmd/keeper.go b/cmd/keeper/cmd/keeper.go
index a9e993d62..934597e45 100644
--- a/cmd/keeper/cmd/keeper.go
+++ b/cmd/keeper/cmd/keeper.go
@@ -20,6 +20,7 @@ import (
 	"fmt"
 	"io"
 	"io/ioutil"
+	"math/rand"
 	"net"
 	"net/http"
 	"os"
@@ -946,19 +947,38 @@ func (p *PostgresKeeper) resync(db, masterDB, followedDB *cluster.DB, tryPgrewin
 	// doesn't exists pgm.SyncFromFollowedPGRewind will return an error and
 	// fallback to pg_basebackup
 	if tryPgrewind && p.usePgrewind(db) {
-		// pg_rewind doesn't support running against a database that is in recovery, as it
-		// builds temporary tables and this is not supported on a hot-standby. Stolon doesn't
-		// currently support cascading replication, but we should be clear when issuing a
-		// rewind that it targets the current primary, rather than whatever database we
-		// follow.
-		connParams := p.getSUConnParams(db, masterDB)
-		log.Infow("syncing using pg_rewind", "masterDB", masterDB.UID, "keeper", followedDB.Spec.KeeperUID)
-		if err := pgm.SyncFromFollowedPGRewind(connParams, p.pgSUPassword, true); err != nil {
-			// log pg_rewind error and fallback to pg_basebackup
-			log.Errorw("error syncing with pg_rewind", zap.Error(err))
-		} else {
-			pgm.SetRecoveryOptions(p.createRecoveryOptions(pg.RecoveryModeStandby, standbySettings, nil, nil))
-			return nil
+		startedPgrewind := time.Now()
+	pgrewindRetries:
+		for {
+			// pg_rewind doesn't support running against a database that is in recovery, as it
+			// builds temporary tables and this is not supported on a hot-standby. Stolon doesn't
+			// currently support cascading replication, but we should be clear when issuing a
+			// rewind that it targets the current primary, rather than whatever database we
+			// follow.
+			connParams := p.getSUConnParams(db, masterDB)
+			log.Infow("syncing using pg_rewind", "masterDB", masterDB.UID, "keeper", followedDB.Spec.KeeperUID)
+			// TODO: Remove the final true once
+			// https://github.com/sorintlab/stolon/pull/644 is merged.
+			if err := pgm.SyncFromFollowedPGRewind(connParams, p.pgSUPassword, true); err != nil {
+				// log the pg_rewind error; retry until the deadline, then fall back to pg_basebackup
+				log.Errorw("error syncing with pg_rewind", zap.Error(err))
+
+				// TODO: This is a GoCardless modification that enables retrying pg_rewind for
+				// up-to 5m, to allow the follower to boot before we attempt to rewind. This
+				// avoids falling back on pg_basebackup unnecessarily, and will eventually require
+				// upstreaming in a more mature form.
+				if time.Since(startedPgrewind) > 5*time.Minute {
+					break pgrewindRetries
+				}
+
+				// Retry the pg_rewind 5-10s after our last attempt
+				log.Infow("sleeping before retrying pg_rewind")
+				time.Sleep((5 * time.Second) + time.Duration(rand.Int63n(int64(5*time.Second))))
+				continue pgrewindRetries
+			} else {
+				pgm.SetRecoveryOptions(p.createRecoveryOptions(pg.RecoveryModeStandby, standbySettings, nil, nil))
+				return nil
+			}
 		}
 	}