From a557e80c64c065e859958ef5940e6f7b49a69e1c Mon Sep 17 00:00:00 2001 From: Dyson Simmons Date: Mon, 19 Aug 2019 11:57:44 +0100 Subject: [PATCH 1/4] Add pg_rewind interval and timeout to clusterspec Allow the setting of a pg_rewind interval and timeout so that the retry behaviour of pg_rewind can be customised. The default values if these are not set in the clusterspec are 0s which results in the current default behaviour of only trying to pg_rewind once if usePgrewind is set to true. --- cmd/sentinel/cmd/sentinel.go | 2 ++ internal/cluster/cluster.go | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/cmd/sentinel/cmd/sentinel.go b/cmd/sentinel/cmd/sentinel.go index d443ee464..32aca1a4f 100644 --- a/cmd/sentinel/cmd/sentinel.go +++ b/cmd/sentinel/cmd/sentinel.go @@ -379,6 +379,8 @@ func (s *Sentinel) setDBSpecFromClusterSpec(cd *cluster.ClusterData) { db.Spec.RequestTimeout = *clusterSpec.RequestTimeout db.Spec.MaxStandbys = *clusterSpec.MaxStandbys db.Spec.UsePgrewind = *clusterSpec.UsePgrewind + db.Spec.PgrewindInterval = *clusterSpec.PgrewindInterval + db.Spec.PgrewindTimeout = *clusterSpec.PgrewindTimeout db.Spec.PGParameters = clusterSpec.PGParameters db.Spec.PGHBA = clusterSpec.PGHBA if db.Spec.FollowConfig != nil && db.Spec.FollowConfig.Type == cluster.FollowTypeExternal { diff --git a/internal/cluster/cluster.go b/internal/cluster/cluster.go index deda5439b..836682e7e 100644 --- a/internal/cluster/cluster.go +++ b/internal/cluster/cluster.go @@ -66,6 +66,8 @@ const ( DefaultMaxSynchronousStandbys uint16 = 1 DefaultAdditionalWalSenders = 5 DefaultUsePgrewind = false + DefaultPgrewindInterval = 0 * time.Second + DefaultPgrewindTimeout = 0 * time.Second DefaultMergePGParameter = true DefaultRole ClusterRole = ClusterRoleMaster DefaultSUReplAccess SUReplAccessMode = SUReplAccessAll @@ -261,6 +263,10 @@ type ClusterSpec struct { AdditionalMasterReplicationSlots []string `json:"additionalMasterReplicationSlots"` // Whether to use pg_rewind 
UsePgrewind *bool `json:"usePgrewind,omitempty"` + // Interval to wait until next pg_rewind try + PgrewindInterval *Duration `json:"PgrewindInterval,omitempty"` + // Time after which we stop trying to pg_rewind + PgrewindTimeout *Duration `json:"PgrewindTimeout,omitempty"` // InitMode defines the cluster initialization mode. Current modes are: new, existing, pitr InitMode *ClusterInitMode `json:"initMode,omitempty"` // Whether to merge pgParameters of the initialized db cluster, useful @@ -379,6 +385,12 @@ func (os *ClusterSpec) WithDefaults() *ClusterSpec { if s.UsePgrewind == nil { s.UsePgrewind = BoolP(DefaultUsePgrewind) } + if s.PgrewindInterval == nil { + s.PgrewindInterval = &Duration{Duration: DefaultPgrewindInterval} + } + if s.PgrewindTimeout == nil { + s.PgrewindTimeout = &Duration{Duration: DefaultPgrewindTimeout} + } if s.MinSynchronousStandbys == nil { s.MinSynchronousStandbys = Uint16P(DefaultMinSynchronousStandbys) } @@ -607,6 +619,10 @@ type DBSpec struct { SynchronousReplication bool `json:"synchronousReplication,omitempty"` // Whether to use pg_rewind UsePgrewind bool `json:"usePgrewind,omitempty"` + // Interval to wait until next pg_rewind try + PgrewindInterval Duration `json:"PgrewindInterval,omitempty"` + // Time after which we stop trying to pg_rewind + PgrewindTimeout Duration `json:"PgrewindTimeout,omitempty"` // AdditionalWalSenders defines the number of additional wal_senders in // addition to the ones internally defined by stolon AdditionalWalSenders uint16 `json:"additionalWalSenders"` From 1c54b7e72b54ead8e1e90bceec7f8a2dcc0de537 Mon Sep 17 00:00:00 2001 From: Dyson Simmons Date: Mon, 19 Aug 2019 17:14:39 +0100 Subject: [PATCH 2/4] Add retrying to pg_rewind GC has implemented cascading replication which requires the retrying of pg_rewind to avoid expensive pg_basebackups. With cascading replication, the old sync is rebooted to repoint the recovery at the new primary. 
The old master will try resyncing against the new sync, while it's rebooting, which causes resync to fail. This falls back to pg_basebackup meaning the old master doesn't recover for several hours. --- cmd/keeper/cmd/keeper.go | 52 ++++++++++++++++++++++++++++++------- internal/cluster/cluster.go | 8 +++--- 2 files changed, 47 insertions(+), 13 deletions(-) diff --git a/cmd/keeper/cmd/keeper.go b/cmd/keeper/cmd/keeper.go index 135be6a01..8ba5cece3 100644 --- a/cmd/keeper/cmd/keeper.go +++ b/cmd/keeper/cmd/keeper.go @@ -19,6 +19,7 @@ import ( "encoding/json" "fmt" "io/ioutil" + "math/rand" "net" "net/http" "os" @@ -809,20 +810,53 @@ func (p *PostgresKeeper) resync(db, followedDB *cluster.DB, tryPgrewind bool) er // postgresql version is > 9.5 since someone can also use an externally // installed pg_rewind for postgres 9.4. If a pg_rewind executable // doesn't exists pgm.SyncFromFollowedPGRewind will return an error and - // fallback to pg_basebackup + // fallback to pg_basebackup. If there is no pg_rewind timeout or + // interval set, we will only try pg_rewind once and fallback to + // pg_basebackup straight away. Otherwise we will keep retrying until + // pg_rewind succeeds or we reach the timeout (and then we will + // fallback to pg_basebackup). 
if tryPgrewind && p.usePgrewind(db) { - connParams := p.getSUConnParams(db, followedDB) - log.Infow("syncing using pg_rewind", "followedDB", followedDB.UID, "keeper", followedDB.Spec.KeeperUID) - // TODO: Make the forceCheckpoint parameter use cluster specification - if err := pgm.SyncFromFollowedPGRewind(connParams, p.pgSUPassword, true); err != nil { - // log pg_rewind error and fallback to pg_basebackup - log.Errorw("error syncing with pg_rewind", zap.Error(err)) - } else { - pgm.SetRecoveryParameters(p.createRecoveryParameters(true, standbySettings, nil, nil)) + var err error + timeout := db.Spec.PgrewindTimeout.Duration + interval := db.Spec.PgrewindInterval.Duration + + pgrewind := func() error { + connParams := p.getSUConnParams(db, followedDB) + log.Infow("syncing using pg_rewind", "followedDB", followedDB.UID, "keeper", followedDB.Spec.KeeperUID) + if err = pgm.SyncFromFollowedPGRewind(connParams, p.pgSUPassword, true); err != nil { + log.Errorw("error syncing with pg_rewind", zap.Error(err)) + pgm.SetRecoveryParameters(p.createRecoveryParameters(true, standbySettings, nil, nil)) + } + return err + } + + if err = pgrewind(); err == nil { return nil } + + // Retry pg_rewind at an interval with jitter until it succeeds or + // we timeout retrying + if timeout != 0 || interval != 0 { + timeoutAfter := time.After(timeout) + jitter := time.Duration(rand.Int63n(int64(interval)) / 4) + intervalTick := time.Tick(interval + jitter) + + Rewind: + for true { + select { + case <-timeoutAfter: + log.Errorw("timed out while trying to run pg_rewind") + break Rewind + case <-intervalTick: + if err = pgrewind(); err == nil { + return nil + } + } + } + } } + // Fallback to pg_basebackup as pg_rewind failed maj, min, err := p.pgm.BinaryVersion() if err != nil { // in case we fail to parse the binary version then log it and just don't use replSlot diff --git a/internal/cluster/cluster.go b/internal/cluster/cluster.go index 836682e7e..64bfbfb5f 100644 --- 
a/internal/cluster/cluster.go +++ b/internal/cluster/cluster.go @@ -264,9 +264,9 @@ type ClusterSpec struct { // Whether to use pg_rewind UsePgrewind *bool `json:"usePgrewind,omitempty"` // Interval to wait until next pg_rewind try - PgrewindInterval *Duration `json:"PgrewindInterval,omitempty"` + PgrewindInterval *Duration `json:"pgrewindInterval,omitempty"` // Time after which we stop trying to pg_rewind - PgrewindTimeout *Duration `json:"PgrewindTimeout,omitempty"` + PgrewindTimeout *Duration `json:"pgrewindTimeout,omitempty"` // InitMode defines the cluster initialization mode. Current modes are: new, existing, pitr InitMode *ClusterInitMode `json:"initMode,omitempty"` // Whether to merge pgParameters of the initialized db cluster, useful @@ -620,9 +620,9 @@ type DBSpec struct { // Whether to use pg_rewind UsePgrewind bool `json:"usePgrewind,omitempty"` // Interval to wait until next pg_rewind try - PgrewindInterval Duration `json:"PgrewindInterval,omitempty"` + PgrewindInterval Duration `json:"pgrewindInterval,omitempty"` // Time after which we stop trying to pg_rewind - PgrewindTimeout Duration `json:"PgrewindTimeout,omitempty"` + PgrewindTimeout Duration `json:"pgrewindTimeout,omitempty"` // AdditionalWalSenders defines the number of additional wal_senders in // addition to the ones internally defined by stolon AdditionalWalSenders uint16 `json:"additionalWalSenders"` From 4f9534716720281a1d818097196ed640867b403d Mon Sep 17 00:00:00 2001 From: Dyson Simmons Date: Tue, 20 Aug 2019 12:16:42 +0100 Subject: [PATCH 3/4] Add pg_rewind checkpoint option to cluster spec Make the forcing of a checkpoint on the primary before performing a pg_rewind a cluster spec option so that it can be turned on or off. 
--- cmd/keeper/cmd/keeper.go | 3 ++- cmd/sentinel/cmd/sentinel.go | 1 + internal/cluster/cluster.go | 8 ++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/cmd/keeper/cmd/keeper.go b/cmd/keeper/cmd/keeper.go index 8ba5cece3..feb808777 100644 --- a/cmd/keeper/cmd/keeper.go +++ b/cmd/keeper/cmd/keeper.go @@ -819,11 +819,12 @@ func (p *PostgresKeeper) resync(db, followedDB *cluster.DB, tryPgrewind bool) er var err error timeout := db.Spec.PgrewindTimeout.Duration interval := db.Spec.PgrewindInterval.Duration + checkpoint := db.Spec.PgrewindCheckpoint pgrewind := func() error { connParams := p.getSUConnParams(db, followedDB) log.Infow("syncing using pg_rewind", "followedDB", followedDB.UID, "keeper", followedDB.Spec.KeeperUID) - if err = pgm.SyncFromFollowedPGRewind(connParams, p.pgSUPassword, true); err != nil { + if err = pgm.SyncFromFollowedPGRewind(connParams, p.pgSUPassword, checkpoint); err != nil { log.Errorw("error syncing with pg_rewind", zap.Error(err)) pgm.SetRecoveryParameters(p.createRecoveryParameters(true, standbySettings, nil, nil)) } diff --git a/cmd/sentinel/cmd/sentinel.go b/cmd/sentinel/cmd/sentinel.go index 32aca1a4f..87bbb6ca6 100644 --- a/cmd/sentinel/cmd/sentinel.go +++ b/cmd/sentinel/cmd/sentinel.go @@ -381,6 +381,7 @@ func (s *Sentinel) setDBSpecFromClusterSpec(cd *cluster.ClusterData) { db.Spec.UsePgrewind = *clusterSpec.UsePgrewind db.Spec.PgrewindInterval = *clusterSpec.PgrewindInterval db.Spec.PgrewindTimeout = *clusterSpec.PgrewindTimeout + db.Spec.PgrewindCheckpoint = *clusterSpec.PgrewindCheckpoint db.Spec.PGParameters = clusterSpec.PGParameters db.Spec.PGHBA = clusterSpec.PGHBA if db.Spec.FollowConfig != nil && db.Spec.FollowConfig.Type == cluster.FollowTypeExternal { diff --git a/internal/cluster/cluster.go b/internal/cluster/cluster.go index 64bfbfb5f..ef98fef55 100644 --- a/internal/cluster/cluster.go +++ b/internal/cluster/cluster.go @@ -68,6 +68,7 @@ const ( DefaultUsePgrewind = false DefaultPgrewindInterval = 0 
* time.Second DefaultPgrewindTimeout = 0 * time.Second + DefaultPgrewindCheckpoint = false DefaultMergePGParameter = true DefaultRole ClusterRole = ClusterRoleMaster DefaultSUReplAccess SUReplAccessMode = SUReplAccessAll @@ -267,6 +268,8 @@ type ClusterSpec struct { PgrewindInterval *Duration `json:"pgrewindInterval,omitempty"` // Time after which we stop trying to pg_rewind PgrewindTimeout *Duration `json:"pgrewindTimeout,omitempty"` + // Checkpoint before performing pg_rewind + PgrewindCheckpoint *bool `json:"pgrewindCheckpoint,omitempty"` // InitMode defines the cluster initialization mode. Current modes are: new, existing, pitr InitMode *ClusterInitMode `json:"initMode,omitempty"` // Whether to merge pgParameters of the initialized db cluster, useful @@ -391,6 +394,9 @@ func (os *ClusterSpec) WithDefaults() *ClusterSpec { if s.PgrewindTimeout == nil { s.PgrewindTimeout = &Duration{Duration: DefaultPgrewindTimeout} } + if s.PgrewindCheckpoint == nil { + s.PgrewindCheckpoint = BoolP(DefaultPgrewindCheckpoint) + } if s.MinSynchronousStandbys == nil { s.MinSynchronousStandbys = Uint16P(DefaultMinSynchronousStandbys) } @@ -623,6 +629,8 @@ type DBSpec struct { PgrewindInterval Duration `json:"pgrewindInterval,omitempty"` // Time after which we stop trying to pg_rewind PgrewindTimeout Duration `json:"pgrewindTimeout,omitempty"` + // Checkpoint before performing pg_rewind + PgrewindCheckpoint bool `json:"pgrewindCheckpoint,omitempty"` // AdditionalWalSenders defines the number of additional wal_senders in // addition to the ones internally defined by stolon AdditionalWalSenders uint16 `json:"additionalWalSenders"` From 821e93cb8eb7fe9a965226bbfd7e6a6178de2b04 Mon Sep 17 00:00:00 2001 From: Dyson Simmons Date: Tue, 20 Aug 2019 12:38:46 +0100 Subject: [PATCH 4/4] Update cluster spec documentation Add pgrewindInterval, pgrewindTimeout, and pgrewindCheckpoint to the cluster spec documentation. 
--- doc/cluster_spec.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/cluster_spec.md b/doc/cluster_spec.md index 0205f637a..a74aa6bbd 100644 --- a/doc/cluster_spec.md +++ b/doc/cluster_spec.md @@ -27,6 +27,9 @@ Some options in a running cluster specification can be changed to update the des | additionalWalSenders | number of additional wal_senders in addition to the ones internally defined by stolon, useful to provide enough wal senders for external standbys (changing this value requires an instance restart) | no | uint16 | 5 | | additionalMasterReplicationSlots | a list of additional physical replication slots to be created on the master postgres instance. They will be prefixed with `stolon_` (like internal replication slots used for standby replication) to make them "namespaced" from other replication slots. Replication slots starting with `stolon_` and not defined here (and not used for standby replication) will be dropped from the master instance. | no | []string | null | | usePgrewind | try to use pg_rewind for faster instance resyncronization. | no | bool | false | +| pgrewindInterval | interval to wait until next pg_rewind try | no | string (duration) | 0s | +| pgrewindTimeout | time after which we stop trying to pg_rewind | no | string (duration) | 0s | +| pgrewindCheckpoint | force checkpoint on primary before performing pg_rewind | no | bool | false | | initMode | The cluster initialization mode. Can be *new* or *existing*. *new* means that a new db cluster will be created on a random keeper and the other keepers will sync with it. *existing* means that a keeper (that needs to have an already created db cluster) will be choosed as the initial master and the other keepers will sync with it. In this case the `existingConfig` object needs to be populated. 
| yes | string | | | existingConfig | configuration for initMode of type "existing" | if initMode is "existing" | ExistingConfig | | | mergePgParameters | merge pgParameters of the initialized db cluster, useful the retain initdb generated parameters when InitMode is new, retain current parameters when initMode is existing or pitr. | no | bool | true |