From 235f733de515448fa3feb8b04e7aad4ba0c5f94c Mon Sep 17 00:00:00 2001 From: Lawrence Jones Date: Tue, 27 Aug 2019 14:17:54 +0100 Subject: [PATCH] Hardcode pg_rewind retrying (up-to 5m) 1. https://github.com/gocardless/stolon/pull/14 Retry pg_rewind for up to 5m after the first attempt. This gives the followed database a moment to finish booting before we fall back to the more expensive pg_basebackup. This comes as a crappier alternative to [1], where we were creating proper configuration values. We want to try this before we attempt to create a more mature setup, as we intend to leave that work for whenever we upstream our cascading replication. --- cmd/keeper/cmd/keeper.go | 46 ++++++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/cmd/keeper/cmd/keeper.go b/cmd/keeper/cmd/keeper.go index a9e993d62..934597e45 100644 --- a/cmd/keeper/cmd/keeper.go +++ b/cmd/keeper/cmd/keeper.go @@ -20,6 +20,7 @@ import ( "fmt" "io" "io/ioutil" + "math/rand" "net" "net/http" "os" @@ -946,19 +947,38 @@ func (p *PostgresKeeper) resync(db, masterDB, followedDB *cluster.DB, tryPgrewin // doesn't exists pgm.SyncFromFollowedPGRewind will return an error and // fallback to pg_basebackup if tryPgrewind && p.usePgrewind(db) { - // pg_rewind doesn't support running against a database that is in recovery, as it - // builds temporary tables and this is not supported on a hot-standby. Stolon doesn't - // currently support cascading replication, but we should be clear when issuing a - // rewind that it targets the current primary, rather than whatever database we - // follow. 
- connParams := p.getSUConnParams(db, masterDB) - log.Infow("syncing using pg_rewind", "masterDB", masterDB.UID, "keeper", followedDB.Spec.KeeperUID) - if err := pgm.SyncFromFollowedPGRewind(connParams, p.pgSUPassword, true); err != nil { - // log pg_rewind error and fallback to pg_basebackup - log.Errorw("error syncing with pg_rewind", zap.Error(err)) - } else { - pgm.SetRecoveryOptions(p.createRecoveryOptions(pg.RecoveryModeStandby, standbySettings, nil, nil)) - return nil + startedPgrewind := time.Now() + pgrewindRetries: + for { + // pg_rewind doesn't support running against a database that is in recovery, as it + // builds temporary tables and this is not supported on a hot-standby. Stolon doesn't + // currently support cascading replication, but we should be clear when issuing a + // rewind that it targets the current primary, rather than whatever database we + // follow. + connParams := p.getSUConnParams(db, masterDB) + log.Infow("syncing using pg_rewind", "masterDB", masterDB.UID, "keeper", followedDB.Spec.KeeperUID) + // TODO: Remove the final true once this is merged: + // https://github.com/sorintlab/stolon/pull/644 + if err := pgm.SyncFromFollowedPGRewind(connParams, p.pgSUPassword, true); err != nil { + // log pg_rewind error, then retry until the 5m window expires before giving up on pg_rewind + log.Errorw("error syncing with pg_rewind", zap.Error(err)) + + // TODO: This is a GoCardless modification that enables retrying pg_rewind for + // up to 5m, to allow the follower to boot before we attempt to rewind. This + // avoids falling back on pg_basebackup unnecessarily, and will eventually require + // upstreaming in a more mature form. 
+ if time.Since(startedPgrewind) > 5*time.Minute { + break pgrewindRetries + } + + // Retry the pg_rewind 5-10s after our last attempt + log.Infow("sleeping before retrying pg_rewind") + time.Sleep((5 * time.Second) + time.Duration(rand.Int63n(int64(5*time.Second)))) + continue pgrewindRetries + } else { + pgm.SetRecoveryOptions(p.createRecoveryOptions(pg.RecoveryModeStandby, standbySettings, nil, nil)) + return nil + } } }