From a557e80c64c065e859958ef5940e6f7b49a69e1c Mon Sep 17 00:00:00 2001 From: Dyson Simmons Date: Mon, 19 Aug 2019 11:57:44 +0100 Subject: [PATCH 1/4] Add pg_rewind interval and timeout to clusterspec Allow the setting of a pg_rewind interval and timeout so that the retry behaviour of pg_rewind can be customised. The default values if these are not set in the clusterspec are 0s which results in the current default behaviour of only trying to pg_rewind once if usePgrewind is set to true. --- cmd/sentinel/cmd/sentinel.go | 2 ++ internal/cluster/cluster.go | 16 ++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/cmd/sentinel/cmd/sentinel.go b/cmd/sentinel/cmd/sentinel.go index d443ee464..32aca1a4f 100644 --- a/cmd/sentinel/cmd/sentinel.go +++ b/cmd/sentinel/cmd/sentinel.go @@ -379,6 +379,8 @@ func (s *Sentinel) setDBSpecFromClusterSpec(cd *cluster.ClusterData) { db.Spec.RequestTimeout = *clusterSpec.RequestTimeout db.Spec.MaxStandbys = *clusterSpec.MaxStandbys db.Spec.UsePgrewind = *clusterSpec.UsePgrewind + db.Spec.PgrewindInterval = *clusterSpec.PgrewindInterval + db.Spec.PgrewindTimeout = *clusterSpec.PgrewindTimeout db.Spec.PGParameters = clusterSpec.PGParameters db.Spec.PGHBA = clusterSpec.PGHBA if db.Spec.FollowConfig != nil && db.Spec.FollowConfig.Type == cluster.FollowTypeExternal { diff --git a/internal/cluster/cluster.go b/internal/cluster/cluster.go index deda5439b..836682e7e 100644 --- a/internal/cluster/cluster.go +++ b/internal/cluster/cluster.go @@ -66,6 +66,8 @@ const ( DefaultMaxSynchronousStandbys uint16 = 1 DefaultAdditionalWalSenders = 5 DefaultUsePgrewind = false + DefaultPgrewindInterval = 0 * time.Second + DefaultPgrewindTimeout = 0 * time.Second DefaultMergePGParameter = true DefaultRole ClusterRole = ClusterRoleMaster DefaultSUReplAccess SUReplAccessMode = SUReplAccessAll @@ -261,6 +263,10 @@ type ClusterSpec struct { AdditionalMasterReplicationSlots []string `json:"additionalMasterReplicationSlots"` // Whether to use pg_rewind 
UsePgrewind *bool `json:"usePgrewind,omitempty"` + // Interval to wait until next pg_rewind try + PgrewindInterval *Duration `json:"PgrewindInterval,omitempty"` + // Time after which we stop trying to pg_rewind + PgrewindTimeout *Duration `json:"PgrewindTimeout,omitempty"` // InitMode defines the cluster initialization mode. Current modes are: new, existing, pitr InitMode *ClusterInitMode `json:"initMode,omitempty"` // Whether to merge pgParameters of the initialized db cluster, useful @@ -379,6 +385,12 @@ func (os *ClusterSpec) WithDefaults() *ClusterSpec { if s.UsePgrewind == nil { s.UsePgrewind = BoolP(DefaultUsePgrewind) } + if s.PgrewindInterval == nil { + s.PgrewindInterval = &Duration{Duration: DefaultPgrewindInterval} + } + if s.PgrewindTimeout == nil { + s.PgrewindTimeout = &Duration{Duration: DefaultPgrewindTimeout} + } if s.MinSynchronousStandbys == nil { s.MinSynchronousStandbys = Uint16P(DefaultMinSynchronousStandbys) } @@ -607,6 +619,10 @@ type DBSpec struct { SynchronousReplication bool `json:"synchronousReplication,omitempty"` // Whether to use pg_rewind UsePgrewind bool `json:"usePgrewind,omitempty"` + // Interval to wait until next pg_rewind try + PgrewindInterval Duration `json:"PgrewindInterval,omitempty"` + // Time after which we stop trying to pg_rewind + PgrewindTimeout Duration `json:"PgrewindTimeout,omitempty"` // AdditionalWalSenders defines the number of additional wal_senders in // addition to the ones internally defined by stolon AdditionalWalSenders uint16 `json:"additionalWalSenders"` From 1c54b7e72b54ead8e1e90bceec7f8a2dcc0de537 Mon Sep 17 00:00:00 2001 From: Dyson Simmons Date: Mon, 19 Aug 2019 17:14:39 +0100 Subject: [PATCH 2/4] Add retrying to pg_rewind GC has implemented cascading replication which requires the retrying of pg_rewind to avoid expensive pg_basebackups. With cascading replication, the old sync is rebooted to repoint the recovery at the new primary. 
The old master will try resyncing against the new sync, while it's rebooting, which causes resync to fail. This falls back to pg_basebackup meaning the old master doesn't recover for several hours. --- cmd/keeper/cmd/keeper.go | 52 ++++++++++++++++++++++++++++++------- internal/cluster/cluster.go | 8 +++--- 2 files changed, 47 insertions(+), 13 deletions(-) diff --git a/cmd/keeper/cmd/keeper.go b/cmd/keeper/cmd/keeper.go index 135be6a01..8ba5cece3 100644 --- a/cmd/keeper/cmd/keeper.go +++ b/cmd/keeper/cmd/keeper.go @@ -19,6 +19,7 @@ import ( "encoding/json" "fmt" "io/ioutil" + "math/rand" "net" "net/http" "os" @@ -809,20 +810,53 @@ func (p *PostgresKeeper) resync(db, followedDB *cluster.DB, tryPgrewind bool) er // postgresql version is > 9.5 since someone can also use an externally // installed pg_rewind for postgres 9.4. If a pg_rewind executable // doesn't exists pgm.SyncFromFollowedPGRewind will return an error and - // fallback to pg_basebackup + // fallback to pg_basebackup. If there is no pg_rewind timeout or + // interval set, we will only try pg_rewind once and fallback to + // pg_basebackup straight away. Otherwise we will keep retrying until + // pg_rewind succeeds or we reach the timeout (and then we will + // fallback to pg_basebackup). 
if tryPgrewind && p.usePgrewind(db) { - connParams := p.getSUConnParams(db, followedDB) - log.Infow("syncing using pg_rewind", "followedDB", followedDB.UID, "keeper", followedDB.Spec.KeeperUID) - // TODO: Make the forceCheckpoint parameter use cluster specification - if err := pgm.SyncFromFollowedPGRewind(connParams, p.pgSUPassword, true); err != nil { - // log pg_rewind error and fallback to pg_basebackup - log.Errorw("error syncing with pg_rewind", zap.Error(err)) - } else { - pgm.SetRecoveryParameters(p.createRecoveryParameters(true, standbySettings, nil, nil)) + var err error + timeout := db.Spec.PgrewindTimeout.Duration + interval := db.Spec.PgrewindInterval.Duration + + pgrewind := func() error { + connParams := p.getSUConnParams(db, followedDB) + log.Infow("syncing using pg_rewind", "followedDB", followedDB.UID, "keeper", followedDB.Spec.KeeperUID) + if err = pgm.SyncFromFollowedPGRewind(connParams, p.pgSUPassword, true); err != nil { + log.Errorw("error syncing with pg_rewind", zap.Error(err)) + pgm.SetRecoveryParameters(p.createRecoveryParameters(true, standbySettings, nil, nil)) + } + return err + } + + if err = pgrewind(); err == nil { return nil } + + // Retry pg_rewind at an interval with jitter until it succeeds or + // we timeout retrying + if timeout != 0 || interval != 0 { + timeoutAfter := time.After(timeout) + jitter := time.Duration(rand.Int63n(int64(interval)) / 4) + intervalTick := time.Tick(interval + jitter) + + Rewind: + for true { + select { + case <-timeoutAfter: + log.Errorw("timed out while trying to run pg_rewind") + break Rewind + case <-intervalTick: + if err = pgrewind(); err == nil { + return nil + } + } + } + } } + // Fallback to pg_basebackup as pg_rewind failed maj, min, err := p.pgm.BinaryVersion() if err != nil { // in case we fail to parse the binary version then log it and just don't use replSlot diff --git a/internal/cluster/cluster.go b/internal/cluster/cluster.go index 836682e7e..64bfbfb5f 100644 --- 
a/internal/cluster/cluster.go +++ b/internal/cluster/cluster.go @@ -264,9 +264,9 @@ type ClusterSpec struct { // Whether to use pg_rewind UsePgrewind *bool `json:"usePgrewind,omitempty"` // Interval to wait until next pg_rewind try - PgrewindInterval *Duration `json:"PgrewindInterval,omitempty"` + PgrewindInterval *Duration `json:"pgrewindInterval,omitempty"` // Time after which we stop trying to pg_rewind - PgrewindTimeout *Duration `json:"PgrewindTimeout,omitempty"` + PgrewindTimeout *Duration `json:"pgrewindTimeout,omitempty"` // InitMode defines the cluster initialization mode. Current modes are: new, existing, pitr InitMode *ClusterInitMode `json:"initMode,omitempty"` // Whether to merge pgParameters of the initialized db cluster, useful @@ -620,9 +620,9 @@ type DBSpec struct { // Whether to use pg_rewind UsePgrewind bool `json:"usePgrewind,omitempty"` // Interval to wait until next pg_rewind try - PgrewindInterval Duration `json:"PgrewindInterval,omitempty"` + PgrewindInterval Duration `json:"pgrewindInterval,omitempty"` // Time after which we stop trying to pg_rewind - PgrewindTimeout Duration `json:"PgrewindTimeout,omitempty"` + PgrewindTimeout Duration `json:"pgrewindTimeout,omitempty"` // AdditionalWalSenders defines the number of additional wal_senders in // addition to the ones internally defined by stolon AdditionalWalSenders uint16 `json:"additionalWalSenders"` From 4f9534716720281a1d818097196ed640867b403d Mon Sep 17 00:00:00 2001 From: Dyson Simmons Date: Tue, 20 Aug 2019 12:16:42 +0100 Subject: [PATCH 3/4] Add pg_rewind checkpoint option to cluster spec Make the forcing of a checkpoint on the primary before performing a pg_rewind a cluster spec option so that it can be turned on or off. 
--- cmd/keeper/cmd/keeper.go | 3 ++- cmd/sentinel/cmd/sentinel.go | 1 + internal/cluster/cluster.go | 8 ++++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/cmd/keeper/cmd/keeper.go b/cmd/keeper/cmd/keeper.go index 8ba5cece3..feb808777 100644 --- a/cmd/keeper/cmd/keeper.go +++ b/cmd/keeper/cmd/keeper.go @@ -819,11 +819,12 @@ func (p *PostgresKeeper) resync(db, followedDB *cluster.DB, tryPgrewind bool) er var err error timeout := db.Spec.PgrewindTimeout.Duration interval := db.Spec.PgrewindInterval.Duration + checkpoint := db.Spec.PgrewindCheckpoint pgrewind := func() error { connParams := p.getSUConnParams(db, followedDB) log.Infow("syncing using pg_rewind", "followedDB", followedDB.UID, "keeper", followedDB.Spec.KeeperUID) - if err = pgm.SyncFromFollowedPGRewind(connParams, p.pgSUPassword, true); err != nil { + if err = pgm.SyncFromFollowedPGRewind(connParams, p.pgSUPassword, checkpoint); err != nil { log.Errorw("error syncing with pg_rewind", zap.Error(err)) pgm.SetRecoveryParameters(p.createRecoveryParameters(true, standbySettings, nil, nil)) } diff --git a/cmd/sentinel/cmd/sentinel.go b/cmd/sentinel/cmd/sentinel.go index 32aca1a4f..87bbb6ca6 100644 --- a/cmd/sentinel/cmd/sentinel.go +++ b/cmd/sentinel/cmd/sentinel.go @@ -381,6 +381,7 @@ func (s *Sentinel) setDBSpecFromClusterSpec(cd *cluster.ClusterData) { db.Spec.UsePgrewind = *clusterSpec.UsePgrewind db.Spec.PgrewindInterval = *clusterSpec.PgrewindInterval db.Spec.PgrewindTimeout = *clusterSpec.PgrewindTimeout + db.Spec.PgrewindCheckpoint = *clusterSpec.PgrewindCheckpoint db.Spec.PGParameters = clusterSpec.PGParameters db.Spec.PGHBA = clusterSpec.PGHBA if db.Spec.FollowConfig != nil && db.Spec.FollowConfig.Type == cluster.FollowTypeExternal { diff --git a/internal/cluster/cluster.go b/internal/cluster/cluster.go index 64bfbfb5f..ef98fef55 100644 --- a/internal/cluster/cluster.go +++ b/internal/cluster/cluster.go @@ -68,6 +68,7 @@ const ( DefaultUsePgrewind = false DefaultPgrewindInterval = 0 
* time.Second DefaultPgrewindTimeout = 0 * time.Second + DefaultPgrewindCheckpoint = false DefaultMergePGParameter = true DefaultRole ClusterRole = ClusterRoleMaster DefaultSUReplAccess SUReplAccessMode = SUReplAccessAll @@ -267,6 +268,8 @@ type ClusterSpec struct { PgrewindInterval *Duration `json:"pgrewindInterval,omitempty"` // Time after which we stop trying to pg_rewind PgrewindTimeout *Duration `json:"pgrewindTimeout,omitempty"` + // Checkpoint before performing pg_rewind + PgrewindCheckpoint *bool `json:"pgrewindCheckpoint,omitempty"` // InitMode defines the cluster initialization mode. Current modes are: new, existing, pitr InitMode *ClusterInitMode `json:"initMode,omitempty"` // Whether to merge pgParameters of the initialized db cluster, useful @@ -391,6 +394,9 @@ func (os *ClusterSpec) WithDefaults() *ClusterSpec { if s.PgrewindTimeout == nil { s.PgrewindTimeout = &Duration{Duration: DefaultPgrewindTimeout} } + if s.PgrewindCheckpoint == nil { + s.PgrewindCheckpoint = BoolP(DefaultPgrewindCheckpoint) + } if s.MinSynchronousStandbys == nil { s.MinSynchronousStandbys = Uint16P(DefaultMinSynchronousStandbys) } @@ -623,6 +629,8 @@ type DBSpec struct { PgrewindInterval Duration `json:"pgrewindInterval,omitempty"` // Time after which we stop trying to pg_rewind PgrewindTimeout Duration `json:"pgrewindTimeout,omitempty"` + // Checkpoint before performing pg_rewind + PgrewindCheckpoint bool `json:"pgrewindCheckpoint,omitempty"` // AdditionalWalSenders defines the number of additional wal_senders in // addition to the ones internally defined by stolon AdditionalWalSenders uint16 `json:"additionalWalSenders"` From 821e93cb8eb7fe9a965226bbfd7e6a6178de2b04 Mon Sep 17 00:00:00 2001 From: Dyson Simmons Date: Tue, 20 Aug 2019 12:38:46 +0100 Subject: [PATCH 4/4] Update cluster spec documentation Add pgrewindInterval, pgrewindTimeout, and pgrewindCheckpoint to the cluster spec documentation. 
--- doc/cluster_spec.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/cluster_spec.md b/doc/cluster_spec.md index 0205f637a..a74aa6bbd 100644 --- a/doc/cluster_spec.md +++ b/doc/cluster_spec.md @@ -27,6 +27,9 @@ Some options in a running cluster specification can be changed to update the des | additionalWalSenders | number of additional wal_senders in addition to the ones internally defined by stolon, useful to provide enough wal senders for external standbys (changing this value requires an instance restart) | no | uint16 | 5 | | additionalMasterReplicationSlots | a list of additional physical replication slots to be created on the master postgres instance. They will be prefixed with `stolon_` (like internal replication slots used for standby replication) to make them "namespaced" from other replication slots. Replication slots starting with `stolon_` and not defined here (and not used for standby replication) will be dropped from the master instance. | no | []string | null | | usePgrewind | try to use pg_rewind for faster instance resyncronization. | no | bool | false | +| pgrewindInterval | interval to wait until next pg_rewind try | no | string (duration) | 0s | +| pgrewindTimeout | time after which we stop trying to pg_rewind | no | string (duration) | 0s | +| pgrewindCheckpoint | force checkpoint on primary before performing pg_rewind | no | bool | false | | initMode | The cluster initialization mode. Can be *new* or *existing*. *new* means that a new db cluster will be created on a random keeper and the other keepers will sync with it. *existing* means that a keeper (that needs to have an already created db cluster) will be choosed as the initial master and the other keepers will sync with it. In this case the `existingConfig` object needs to be populated. 
| yes | string | | | existingConfig | configuration for initMode of type "existing" | if initMode is "existing" | ExistingConfig | | | mergePgParameters | merge pgParameters of the initialized db cluster, useful the retain initdb generated parameters when InitMode is new, retain current parameters when initMode is existing or pitr. | no | bool | true |