From e64f64d29a0e85fa1b99d2ebe18e0d30b2672253 Mon Sep 17 00:00:00 2001 From: Steve Smith Date: Tue, 17 Mar 2026 12:20:26 -0600 Subject: [PATCH 01/12] database: add failover-safe defaults and retry logic for Postgres/AlloyDB During AlloyDB maintenance switchovers (< 1s downtime), services using database/sql with pgx fail to recover because: 1. No connection pool limits are set by default, so dead connections persist indefinitely 2. No retry logic exists for transient connection errors This adds: - DefaultPostgresConnectionsConfig() with MaxLifetime=5m, MaxIdleTime=30s to ensure dead connections are evicted quickly after failover - ApplyPostgresConnectionsConfig() that fills in safe defaults when services don't explicitly configure pool settings - IsRetryablePostgresError() to classify transient PG/network errors - RetryPostgres() for services to wrap critical DB operations Co-Authored-By: Claude Opus 4.6 --- database/database.go | 26 ++++++++- database/database_test.go | 49 +++++++++++++++++ database/model_config.go | 13 +++++ database/postgres.go | 88 +++++++++++++++++++++++++++++++ database/postgres_test.go | 107 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 282 insertions(+), 1 deletion(-) diff --git a/database/database.go b/database/database.go index 67543936..087dd5b8 100644 --- a/database/database.go +++ b/database/database.go @@ -40,7 +40,7 @@ func New(ctx context.Context, logger log.Logger, config DatabaseConfig) (*sql.DB if err != nil { return nil, fmt.Errorf("connecting to postgres: %w", err) } - return ApplyConnectionsConfig(db, &config.Postgres.Connections, logger), nil + return ApplyPostgresConnectionsConfig(db, &config.Postgres.Connections, logger), nil } return nil, ErrMissingConfig @@ -112,3 +112,27 @@ func ApplyConnectionsConfig(db *sql.DB, connections *ConnectionsConfig, logger l return db } + +// ApplyPostgresConnectionsConfig applies connection pool settings with safe defaults +// for Postgres/AlloyDB. If any value in the provided config is zero, the corresponding +// default from DefaultPostgresConnectionsConfig is used. This ensures all services get +// failover-safe pool settings even if they don't explicitly configure them. +func ApplyPostgresConnectionsConfig(db *sql.DB, connections *ConnectionsConfig, logger log.Logger) *sql.DB { + defaults := DefaultPostgresConnectionsConfig() + + applied := *connections + if applied.MaxOpen <= 0 { + applied.MaxOpen = defaults.MaxOpen + } + if applied.MaxIdle <= 0 { + applied.MaxIdle = defaults.MaxIdle + } + if applied.MaxLifetime <= 0 { + applied.MaxLifetime = defaults.MaxLifetime + } + if applied.MaxIdleTime <= 0 { + applied.MaxIdleTime = defaults.MaxIdleTime + } + + return ApplyConnectionsConfig(db, &applied, logger) +} diff --git a/database/database_test.go b/database/database_test.go index a648aa02..abda3ddf 100644 --- a/database/database_test.go +++ b/database/database_test.go @@ -5,13 +5,16 @@ package database_test import ( "bytes" + "database/sql" "errors" "os" "testing" + "time" gomysql "github.com/go-sql-driver/mysql" "github.com/jackc/pgx/v5/pgconn" "github.com/moov-io/base/database" + "github.com/moov-io/base/log" "github.com/stretchr/testify/require" ) @@ -121,6 +124,52 @@ func TestDataTooLong(t *testing.T) { } } +func TestApplyPostgresConnectionsConfig_Defaults(t *testing.T) { + // When no config values are set, defaults should be applied + db, err := sql.Open("txdb", "TestApplyPostgresConnectionsConfig_Defaults") + if err != nil { + t.Skip("skipping test without txdb driver") + } + defer db.Close() + + connections := &database.ConnectionsConfig{} + database.ApplyPostgresConnectionsConfig(db, connections, log.NewTestLogger()) + + defaults := database.DefaultPostgresConnectionsConfig() + stats := db.Stats() + require.Equal(t, defaults.MaxOpen, stats.MaxOpenConnections) +} + +func TestApplyPostgresConnectionsConfig_Overrides(t *testing.T) { + // When config values are set, they should override defaults + db, err := sql.Open("txdb", "TestApplyPostgresConnectionsConfig_Overrides") + if err != nil { + t.Skip("skipping test without txdb driver") + } + defer db.Close() + + connections := &database.ConnectionsConfig{ + MaxOpen: 10, + MaxIdle: 3, + MaxLifetime: time.Minute, + MaxIdleTime: time.Second * 15, + } + database.ApplyPostgresConnectionsConfig(db, connections, log.NewTestLogger()) + + stats := db.Stats() + require.Equal(t, 10, stats.MaxOpenConnections) +} + +func TestDefaultPostgresConnectionsConfig(t *testing.T) { + defaults := database.DefaultPostgresConnectionsConfig() + require.Greater(t, defaults.MaxOpen, 0) + require.Greater(t, defaults.MaxIdle, 0) + require.Greater(t, defaults.MaxLifetime, time.Duration(0)) + require.Greater(t, defaults.MaxIdleTime, time.Duration(0)) + // MaxIdleTime should be shorter than MaxLifetime + require.Less(t, defaults.MaxIdleTime, defaults.MaxLifetime) +} + func TestConnectionsConfigOrder(t *testing.T) { bs, err := os.ReadFile("database.go") require.NoError(t, err) diff --git a/database/model_config.go b/database/model_config.go index ea8b9026..d5b84150 100644 --- a/database/model_config.go +++ b/database/model_config.go @@ -99,6 +99,19 @@ type ConnectionsConfig struct { MaxIdleTime time.Duration } +// DefaultPostgresConnectionsConfig returns connection pool defaults tuned for +// database failover recovery (e.g. AlloyDB maintenance switchovers). Short +// lifetimes and idle times ensure dead connections are evicted quickly so +// the pool re-establishes connections to the new primary. +func DefaultPostgresConnectionsConfig() ConnectionsConfig { + return ConnectionsConfig{ + MaxOpen: 25, + MaxIdle: 5, + MaxLifetime: 5 * time.Minute, + MaxIdleTime: 30 * time.Second, + } +} + type RetryConfig struct { MaxAttempts int MinDuration time.Duration diff --git a/database/postgres.go b/database/postgres.go index 7c98cdd2..4d7f463f 100644 --- a/database/postgres.go +++ b/database/postgres.go @@ -5,8 +5,10 @@ import ( "database/sql" "errors" "fmt" + "io" "net" "strings" + "time" "cloud.google.com/go/alloydbconn" "github.com/jackc/pgx/v5" @@ -164,3 +166,89 @@ func PostgresDeadlockFound(err error) bool { return strings.Contains(err.Error(), postgresErrDeadlockFound) } + +// IsRetryablePostgresError returns true if the error is a transient connection-level +// error that is safe to retry. This covers the errors seen during AlloyDB maintenance +// switchovers and other transient network failures. +func IsRetryablePostgresError(err error) bool { + if err == nil { + return false + } + + // PostgreSQL error codes indicating the server is shutting down or unavailable + var pgErr *pgconn.PgError + if errors.As(err, &pgErr) { + switch pgErr.Code { + case "57P01": // admin_shutdown + return true + case "57P02": // crash_shutdown + return true + case "57P03": // cannot_connect_now + return true + case "08000": // connection_exception + return true + case "08001": // sqlclient_unable_to_establish_sqlconnection + return true + case "08003": // connection_does_not_exist + return true + case "08004": // sqlserver_rejected_establishment_of_sqlconnection + return true + case "08006": // connection_failure + return true + } + return false + } + + // Network-level errors: connection reset, broken pipe, EOF, etc. + // These occur when the TCP connection is severed during a switchover. + var netErr *net.OpError + if errors.As(err, &netErr) { + return true + } + if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) { + return true + } + if errors.Is(err, context.DeadlineExceeded) { + return false // don't retry if the caller's context timed out + } + + // pgx wraps connection errors with these messages + msg := err.Error() + if strings.Contains(msg, "connection reset by peer") || + strings.Contains(msg, "broken pipe") || + strings.Contains(msg, "connection refused") || + strings.Contains(msg, "unexpected EOF") || + strings.Contains(msg, "conn closed") { + return true + } + + return false +} + +// RetryPostgres executes fn up to maxAttempts times, retrying on transient +// connection errors. This is intended for use around individual database +// operations to survive brief outages like AlloyDB maintenance switchovers. +func RetryPostgres(ctx context.Context, maxAttempts int, fn func() error) error { + if maxAttempts <= 0 { + maxAttempts = 3 + } + var err error + for attempt := 0; attempt < maxAttempts; attempt++ { + err = fn() + if err == nil { + return nil + } + if !IsRetryablePostgresError(err) { + return err + } + if attempt < maxAttempts-1 { + backoff := time.Duration(attempt+1) * 200 * time.Millisecond + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(backoff): + } + } + } + return err +} diff --git a/database/postgres_test.go b/database/postgres_test.go index 68dcb14d..e9644e1d 100644 --- a/database/postgres_test.go +++ b/database/postgres_test.go @@ -2,11 +2,15 @@ package database_test import ( "context" + "errors" + "io" + "net" "os" "path/filepath" "testing" "time" + "github.com/jackc/pgx/v5/pgconn" "github.com/moov-io/base" "github.com/moov-io/base/database" "github.com/moov-io/base/database/testdb" @@ -180,6 +184,109 @@ func Test_Postgres_Alloy_Migrations(t *testing.T) { defer db.Close() } +func TestIsRetryablePostgresError(t *testing.T) { + // nil error is not retryable + require.False(t, database.IsRetryablePostgresError(nil)) + + // admin_shutdown is retryable (seen during AlloyDB maintenance) + require.True(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "57P01"})) + + // crash_shutdown is retryable + require.True(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "57P02"})) + + // cannot_connect_now is retryable + require.True(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "57P03"})) + + // connection_exception class is retryable + require.True(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "08006"})) + + // unique_violation is NOT retryable (application-level error) + require.False(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "23505"})) + + // syntax_error is NOT retryable + require.False(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "42601"})) + + // EOF is retryable (connection severed) + require.True(t, database.IsRetryablePostgresError(io.EOF)) + require.True(t, database.IsRetryablePostgresError(io.ErrUnexpectedEOF)) + + // net.OpError is retryable + require.True(t, database.IsRetryablePostgresError(&net.OpError{ + Op: "read", + Err: errors.New("connection reset by peer"), + })) + + // context.DeadlineExceeded is NOT retryable + require.False(t, database.IsRetryablePostgresError(context.DeadlineExceeded)) + + // String-matched connection errors + require.True(t, database.IsRetryablePostgresError(errors.New("connection reset by peer"))) + require.True(t, database.IsRetryablePostgresError(errors.New("broken pipe"))) + require.True(t, database.IsRetryablePostgresError(errors.New("conn closed"))) + + // Random application error is NOT retryable + require.False(t, database.IsRetryablePostgresError(errors.New("invalid input"))) +} + +func TestRetryPostgres(t *testing.T) { + t.Run("succeeds on first attempt", func(t *testing.T) { + calls := 0 + err := database.RetryPostgres(context.Background(), 3, func() error { + calls++ + return nil + }) + require.NoError(t, err) + require.Equal(t, 1, calls) + }) + + t.Run("retries on transient error then succeeds", func(t *testing.T) { + calls := 0 + err := database.RetryPostgres(context.Background(), 3, func() error { + calls++ + if calls < 3 { + return &pgconn.PgError{Code: "57P01"} // admin_shutdown + } + return nil + }) + require.NoError(t, err) + require.Equal(t, 3, calls) + }) + + t.Run("does not retry non-retryable errors", func(t *testing.T) { + calls := 0 + err := database.RetryPostgres(context.Background(), 3, func() error { + calls++ + return &pgconn.PgError{Code: "23505"} // unique_violation + }) + require.Error(t, err) + require.Equal(t, 1, calls) + }) + + t.Run("respects context cancellation", func(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + cancel() // cancel immediately + + calls := 0 + err := database.RetryPostgres(ctx, 3, func() error { + calls++ + return io.EOF // retryable, but context is done + }) + // First call happens, then context cancellation is detected + require.Error(t, err) + }) + + t.Run("exhausts all attempts", func(t *testing.T) { + calls := 0 + err := database.RetryPostgres(context.Background(), 3, func() error { + calls++ + return io.EOF + }) + require.Error(t, err) + require.Equal(t, 3, calls) + require.ErrorIs(t, err, io.EOF) + }) +} + func Test_Postgres_UniqueViolation(t *testing.T) { if testing.Short() { t.Skip("-short flag enabled") From 931ec09a12233bc9378714a8beb1b9ef7bfb5572 Mon Sep 17 00:00:00 2001 From: Steve Smith Date: Tue, 17 Mar 2026 12:29:57 -0600 Subject: [PATCH 02/12] database: use pgxpool with HealthCheckPeriod under the hood MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch from sql.Open("pgx") to pgxpool.NewWithConfig() wrapped with stdlib.OpenDBFromPool(). This gives us pgxpool's HealthCheckPeriod (set to 5s) which proactively pings idle connections and evicts dead ones — the most important fix for surviving AlloyDB maintenance switchovers. The return type remains *sql.DB so no downstream changes are needed. Also cleans up the dialer TODO (dialer lifecycle is now tied to the pool) and removes the unused pgx.ParseConfig import path. Co-Authored-By: Claude Opus 4.6 --- database/postgres.go | 116 +++++++++++++++++++++++-------------------- 1 file changed, 62 insertions(+), 54 deletions(-) diff --git a/database/postgres.go b/database/postgres.go index 4d7f463f..2b3a628c 100644 --- a/database/postgres.go +++ b/database/postgres.go @@ -11,8 +11,8 @@ import ( "time" "cloud.google.com/go/alloydbconn" - "github.com/jackc/pgx/v5" "github.com/jackc/pgx/v5/pgconn" + "github.com/jackc/pgx/v5/pgxpool" "github.com/jackc/pgx/v5/stdlib" "github.com/moov-io/base/log" ) @@ -25,67 +25,50 @@ const ( ) func postgresConnection(ctx context.Context, logger log.Logger, config PostgresConfig, databaseName string) (*sql.DB, error) { - var connStr string - if config.Alloy != nil { - c, err := getAlloyDBConnectorConnStr(ctx, config, databaseName) - if err != nil { - return nil, logger.LogErrorf("creating alloydb connection: %w", err).Err() - } - connStr = c - } else { - c, err := getPostgresConnStr(config, databaseName) - if err != nil { - return nil, logger.LogErrorf("creating postgres connection: %w", err).Err() - } - connStr = c + poolConfig, err := buildPgxPoolConfig(ctx, config, databaseName) + if err != nil { + return nil, logger.LogErrorf("building pgx pool config: %w", err).Err() } - db, err := sql.Open("pgx", connStr) + // HealthCheckPeriod makes pgxpool ping idle connections in the background. + // Dead connections (e.g. from an AlloyDB switchover) are evicted before + // the application ever sees them. + poolConfig.HealthCheckPeriod = 5 * time.Second + + pool, err := pgxpool.NewWithConfig(ctx, poolConfig) if err != nil { - return nil, logger.LogErrorf("opening database: %w", err).Err() + return nil, logger.LogErrorf("creating pgx pool: %w", err).Err() } - err = db.Ping() + err = pool.Ping(ctx) if err != nil { - _ = db.Close() + pool.Close() return nil, logger.LogErrorf("connecting to database: %w", err).Err() } + // Wrap the pgxpool in a *sql.DB so the rest of the codebase doesn't change. + // pgxpool manages the real pool (with health checks); database/sql pool + // settings are applied on top via ApplyPostgresConnectionsConfig. + db := stdlib.OpenDBFromPool(pool) + return db, nil } -func getPostgresConnStr(config PostgresConfig, databaseName string) (string, error) { - url := fmt.Sprintf("postgres://%s:%s@%s/%s", config.User, config.Password, config.Address, databaseName) - - params := "" - - if config.TLS != nil { - if len(config.TLS.Mode) < 1 { - config.TLS.Mode = "verify-full" - } - - params += "sslmode=" + config.TLS.Mode - - if len(config.TLS.CACertFile) > 0 { - params += "&sslrootcert=" + config.TLS.CACertFile - } - - if len(config.TLS.ClientCertFile) > 0 { - params += "&sslcert=" + config.TLS.ClientCertFile - } - - if len(config.TLS.ClientKeyFile) > 0 { - params += "&sslkey=" + config.TLS.ClientKeyFile - } +func buildPgxPoolConfig(ctx context.Context, config PostgresConfig, databaseName string) (*pgxpool.Config, error) { + if config.Alloy != nil { + return buildAlloyDBPoolConfig(ctx, config, databaseName) } - connStr := fmt.Sprintf("%s?%s", url, params) - return connStr, nil + connStr, err := getPostgresConnStr(config, databaseName) + if err != nil { + return nil, err + } + return pgxpool.ParseConfig(connStr) } -func getAlloyDBConnectorConnStr(ctx context.Context, config PostgresConfig, databaseName string) (string, error) { +func buildAlloyDBPoolConfig(ctx context.Context, config PostgresConfig, databaseName string) (*pgxpool.Config, error) { if config.Alloy == nil { - return "", fmt.Errorf("missing alloy config") + return nil, fmt.Errorf("missing alloy config") } var dialer *alloydbconn.Dialer @@ -94,7 +77,7 @@ func getAlloyDBConnectorConnStr(ctx context.Context, config PostgresConfig, data if config.Alloy.UseIAM { d, err := alloydbconn.NewDialer(ctx, alloydbconn.WithIAMAuthN()) if err != nil { - return "", fmt.Errorf("creating alloydb dialer: %v", err) + return nil, fmt.Errorf("creating alloydb dialer: %v", err) } dialer = d dsn = fmt.Sprintf( @@ -106,7 +89,7 @@ func getAlloyDBConnectorConnStr(ctx context.Context, config PostgresConfig, data } else { d, err := alloydbconn.NewDialer(ctx) if err != nil { - return "", fmt.Errorf("creating alloydb dialer: %v", err) + return nil, fmt.Errorf("creating alloydb dialer: %v", err) } dialer = d dsn = fmt.Sprintf( @@ -116,12 +99,9 @@ func getAlloyDBConnectorConnStr(ctx context.Context, config PostgresConfig, data ) } - // TODO - //cleanup := func() error { return d.Close() } - - connConfig, err := pgx.ParseConfig(dsn) + poolConfig, err := pgxpool.ParseConfig(dsn) if err != nil { - return "", fmt.Errorf("failed to parse pgx config: %v", err) + return nil, fmt.Errorf("failed to parse pgx pool config: %v", err) } var connOptions []alloydbconn.DialOption @@ -129,11 +109,39 @@ func getAlloyDBConnectorConnStr(ctx context.Context, config PostgresConfig, data connOptions = append(connOptions, alloydbconn.WithPSC()) } - connConfig.DialFunc = func(ctx context.Context, _ string, _ string) (net.Conn, error) { + poolConfig.ConnConfig.DialFunc = func(ctx context.Context, _ string, _ string) (net.Conn, error) { return dialer.Dial(ctx, config.Alloy.InstanceURI, connOptions...) } - connStr := stdlib.RegisterConnConfig(connConfig) + return poolConfig, nil +} + +func getPostgresConnStr(config PostgresConfig, databaseName string) (string, error) { + url := fmt.Sprintf("postgres://%s:%s@%s/%s", config.User, config.Password, config.Address, databaseName) + + params := "" + + if config.TLS != nil { + if len(config.TLS.Mode) < 1 { + config.TLS.Mode = "verify-full" + } + + params += "sslmode=" + config.TLS.Mode + + if len(config.TLS.CACertFile) > 0 { + params += "&sslrootcert=" + config.TLS.CACertFile + } + + if len(config.TLS.ClientCertFile) > 0 { + params += "&sslcert=" + config.TLS.ClientCertFile + } + + if len(config.TLS.ClientKeyFile) > 0 { + params += "&sslkey=" + config.TLS.ClientKeyFile + } + } + + connStr := fmt.Sprintf("%s?%s", url, params) return connStr, nil } From c3e3eda4e54487219b9c7649ad52bb614f8785bc Mon Sep 17 00:00:00 2001 From: Steve Smith Date: Tue, 17 Mar 2026 12:31:25 -0600 Subject: [PATCH 03/12] database: lower HealthCheckPeriod from 5s to 1s For sub-second AlloyDB switchovers, 1s health checks detect and evict dead connections faster with negligible overhead. Co-Authored-By: Claude Opus 4.6 --- database/postgres.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/database/postgres.go b/database/postgres.go index 2b3a628c..09d15c8e 100644 --- a/database/postgres.go +++ b/database/postgres.go @@ -33,7 +33,7 @@ func postgresConnection(ctx context.Context, logger log.Logger, config PostgresC // HealthCheckPeriod makes pgxpool ping idle connections in the background. // Dead connections (e.g. from an AlloyDB switchover) are evicted before // the application ever sees them. - poolConfig.HealthCheckPeriod = 5 * time.Second + poolConfig.HealthCheckPeriod = 1 * time.Second pool, err := pgxpool.NewWithConfig(ctx, poolConfig) if err != nil { From b56ff9ecbec212ab54cf201f4e7e687b152e76eb Mon Sep 17 00:00:00 2001 From: Steve Smith Date: Tue, 17 Mar 2026 13:10:23 -0600 Subject: [PATCH 04/12] database: address review feedback from Gemini - Collapse switch cases in IsRetryablePostgresError for readability - Make context cancellation test more specific (assert context.Canceled and exact call count) Co-Authored-By: Claude Opus 4.6 --- database/postgres.go | 16 ++-------------- database/postgres_test.go | 3 ++- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/database/postgres.go b/database/postgres.go index 09d15c8e..f9be0d9f 100644 --- a/database/postgres.go +++ b/database/postgres.go @@ -187,21 +187,9 @@ func IsRetryablePostgresError(err error) bool { var pgErr *pgconn.PgError if errors.As(err, &pgErr) { switch pgErr.Code { - case "57P01": // admin_shutdown + case "57P01", "57P02", "57P03": // admin_shutdown, crash_shutdown, cannot_connect_now return true - case "57P02": // crash_shutdown - return true - case "57P03": // cannot_connect_now - return true - case "08000": // connection_exception - return true - case "08001": // sqlclient_unable_to_establish_sqlconnection - return true - case "08003": // connection_does_not_exist - return true - case "08004": // sqlserver_rejected_establishment_of_sqlconnection - return true - case "08006": // connection_failure + case "08000", "08001", "08003", "08004", "08006": // connection_exception class return true } return false diff --git a/database/postgres_test.go b/database/postgres_test.go index e9644e1d..2edcde38 100644 --- a/database/postgres_test.go +++ b/database/postgres_test.go @@ -272,7 +272,8 @@ func TestRetryPostgres(t *testing.T) { return io.EOF // retryable, but context is done }) // First call happens, then context cancellation is detected - require.Error(t, err) + require.ErrorIs(t, err, context.Canceled) + require.Equal(t, 1, calls) }) t.Run("exhausts all attempts", func(t *testing.T) { From aea2db6af2296d9ab10ff1ec6db9d15261c6f12c Mon Sep 17 00:00:00 2001 From: Steve Smith Date: Thu, 25 Jun 2026 12:56:18 -0600 Subject: [PATCH 05/12] database: tighten IsRetryablePostgresError to only safe, pgx-surfaced codes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the 08xxx connection_exception PgError cases — pgx never surfaces these as *pgconn.PgError, so they were dead code. Add the codes that pgx does return and where the server guarantees a rollback before returning the error: 40001 (serialization_failure), 40P01 (deadlock_detected), 53300 (too_many_connections), and 57014 (query_canceled). Co-Authored-By: Claude Sonnet 4.6 (1M context) --- database/postgres.go | 30 ++++++++++++++++++++++++++---- database/postgres_test.go | 11 +++++++++-- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/database/postgres.go b/database/postgres.go index f9be0d9f..47a9200c 100644 --- a/database/postgres.go +++ b/database/postgres.go @@ -183,13 +183,35 @@ func IsRetryablePostgresError(err error) bool { return false } - // PostgreSQL error codes indicating the server is shutting down or unavailable + // PostgreSQL error codes that are safe to retry because the server guarantees + // the transaction was rolled back before the error was returned. + // + // 57P01 admin_shutdown, 57P02 crash_shutdown: fast shutdown rolls back all + // active transactions before disconnecting clients. See: + // https://www.postgresql.org/docs/current/server-shutdown.html + // + // 57P03 cannot_connect_now: server is still starting up; the operation never + // reached a transaction. + // + // 40001 serialization_failure, 40P01 deadlock_detected: the server explicitly + // rolled back the transaction and expects the client to retry. See: + // https://www.postgresql.org/docs/current/mvcc-serialization-failure-handling.html + // + // 53300 too_many_connections: rejected at connect time; no transaction started. + // + // 57014 query_canceled: the server rolled back the transaction before returning + // this error. + // + // Note: 08xxx (connection_exception class) codes are intentionally omitted. + // pgx surfaces connection-level failures as TCP/network errors, not as + // *pgconn.PgError, so those cases are handled below. var pgErr *pgconn.PgError if errors.As(err, &pgErr) { switch pgErr.Code { - case "57P01", "57P02", "57P03": // admin_shutdown, crash_shutdown, cannot_connect_now - return true - case "08000", "08001", "08003", "08004", "08006": // connection_exception class + case "57P01", "57P02", "57P03", // admin_shutdown, crash_shutdown, cannot_connect_now + "40001", "40P01", // serialization_failure, deadlock_detected + "53300", // too_many_connections + "57014": // query_canceled return true } return false diff --git a/database/postgres_test.go b/database/postgres_test.go index 2edcde38..9f663a0c 100644 --- a/database/postgres_test.go +++ b/database/postgres_test.go @@ -197,8 +197,15 @@ func TestIsRetryablePostgresError(t *testing.T) { // cannot_connect_now is retryable require.True(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "57P03"})) - // connection_exception class is retryable - require.True(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "08006"})) + // serialization_failure and deadlock_detected are retryable (server rolled back) + require.True(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "40001"})) + require.True(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "40P01"})) + + // too_many_connections is retryable (rejected at connect time) + require.True(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "53300"})) + + // query_canceled is retryable (server rolled back the transaction) + require.True(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "57014"})) // unique_violation is NOT retryable (application-level error) require.False(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "23505"})) From 908c5f532f0d253a8d47a7de616cca52b1f471ca Mon Sep 17 00:00:00 2001 From: Steve Smith Date: Thu, 25 Jun 2026 13:44:54 -0600 Subject: [PATCH 06/12] database: replace linear backoff with full jitter in RetryPostgres MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the fixed 200ms×attempt linear backoff with a random delay in [0, 100ms) per retry gap. Full jitter spreads concurrent retries across the fleet instead of having all callers wait the same fixed interval and slam the database simultaneously. Worst-case retry window across 3 attempts is ~200ms, sized for AlloyDB planned maintenance switchovers which typically cause less than one second of downtime. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- database/postgres.go | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/database/postgres.go b/database/postgres.go index 47a9200c..499704ca 100644 --- a/database/postgres.go +++ b/database/postgres.go @@ -6,6 +6,7 @@ import ( "errors" "fmt" "io" + "math/rand" "net" "strings" "time" @@ -243,6 +244,16 @@ func IsRetryablePostgresError(err error) bool { return false } +// retryJitterMax is the upper bound for the random delay between retries. +// Full jitter in [0, retryJitterMax) spreads concurrent retries across the +// fleet rather than letting them all slam the database at the same instant. +// AlloyDB planned maintenance switchovers typically cause less than one second +// of downtime, so three attempts with up to 100ms between each gives a worst- +// case retry window of ~200ms — well within typical service SLAs while still +// spanning the switchover blip. The caller's context deadline is the escape +// hatch for services with tighter latency budgets. +const retryJitterMax = 100 * time.Millisecond + // RetryPostgres executes fn up to maxAttempts times, retrying on transient // connection errors. This is intended for use around individual database // operations to survive brief outages like AlloyDB maintenance switchovers. @@ -260,11 +271,11 @@ func RetryPostgres(ctx context.Context, maxAttempts int, fn func() error) error return err } if attempt < maxAttempts-1 { - backoff := time.Duration(attempt+1) * 200 * time.Millisecond + delay := time.Duration(rand.Int63n(int64(retryJitterMax))) select { case <-ctx.Done(): return ctx.Err() - case <-time.After(backoff): + case <-time.After(delay): } } } From fb1114042e9dad26368dfd657f874dacca70e74c Mon Sep 17 00:00:00 2001 From: Steve Smith Date: Thu, 25 Jun 2026 13:48:34 -0600 Subject: [PATCH 07/12] database: remove dead string matches from IsRetryablePostgresError Drop "connection refused" (not present in pgx's codebase) and "unexpected EOF" (already caught by errors.Is(err, io.ErrUnexpectedEOF)). Add comments explaining why the remaining string matches are safe to retry. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- database/postgres.go | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/database/postgres.go b/database/postgres.go index 499704ca..4411d660 100644 --- a/database/postgres.go +++ b/database/postgres.go @@ -231,12 +231,16 @@ func IsRetryablePostgresError(err error) bool { return false // don't retry if the caller's context timed out } - // pgx wraps connection errors with these messages + // pgx sometimes wraps TCP-level disconnection errors as plain strings rather + // than preserving the original net.OpError type. These are safe to retry + // because they occur on the write path — the server never received the query. + // "conn closed" is pgx's own sentinel for a connection already closed before use. + // Note: "connection refused" and "unexpected EOF" are intentionally excluded — + // the former does not appear in pgx's codebase, the latter is already caught + // above by errors.Is(err, io.ErrUnexpectedEOF). msg := err.Error() if strings.Contains(msg, "connection reset by peer") || strings.Contains(msg, "broken pipe") || - strings.Contains(msg, "connection refused") || - strings.Contains(msg, "unexpected EOF") || strings.Contains(msg, "conn closed") { return true } From 7192463b2071abd6135caadd48527b049df6f57f Mon Sep 17 00:00:00 2001 From: Steve Smith Date: Tue, 30 Jun 2026 15:10:31 -0600 Subject: [PATCH 08/12] database: remove retry utilities from this PR IsRetryablePostgresError and RetryPostgres are moved to a follow-up PR where the explicit transaction / pg_xact_status approach can be designed properly. This PR focuses solely on pool health: pgxpool with HealthCheckPeriod=1s evicts dead connections within ~1s of an AlloyDB switchover so the pool is healthy before the next request arrives. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- database/postgres.go | 111 ------------------------------------- database/postgres_test.go | 114 -------------------------------------- 2 files changed, 225 deletions(-) diff --git a/database/postgres.go b/database/postgres.go index 4411d660..12db66a9 100644 --- a/database/postgres.go +++ b/database/postgres.go @@ -5,8 +5,6 @@ import ( "database/sql" "errors" "fmt" - "io" - "math/rand" "net" "strings" "time" @@ -176,112 +174,3 @@ func PostgresDeadlockFound(err error) bool { return strings.Contains(err.Error(), postgresErrDeadlockFound) } -// IsRetryablePostgresError returns true if the error is a transient connection-level -// error that is safe to retry. This covers the errors seen during AlloyDB maintenance -// switchovers and other transient network failures. -func IsRetryablePostgresError(err error) bool { - if err == nil { - return false - } - - // PostgreSQL error codes that are safe to retry because the server guarantees - // the transaction was rolled back before the error was returned. - // - // 57P01 admin_shutdown, 57P02 crash_shutdown: fast shutdown rolls back all - // active transactions before disconnecting clients. See: - // https://www.postgresql.org/docs/current/server-shutdown.html - // - // 57P03 cannot_connect_now: server is still starting up; the operation never - // reached a transaction. - // - // 40001 serialization_failure, 40P01 deadlock_detected: the server explicitly - // rolled back the transaction and expects the client to retry. See: - // https://www.postgresql.org/docs/current/mvcc-serialization-failure-handling.html - // - // 53300 too_many_connections: rejected at connect time; no transaction started. - // - // 57014 query_canceled: the server rolled back the transaction before returning - // this error. - // - // Note: 08xxx (connection_exception class) codes are intentionally omitted. - // pgx surfaces connection-level failures as TCP/network errors, not as - // *pgconn.PgError, so those cases are handled below. - var pgErr *pgconn.PgError - if errors.As(err, &pgErr) { - switch pgErr.Code { - case "57P01", "57P02", "57P03", // admin_shutdown, crash_shutdown, cannot_connect_now - "40001", "40P01", // serialization_failure, deadlock_detected - "53300", // too_many_connections - "57014": // query_canceled - return true - } - return false - } - - // Network-level errors: connection reset, broken pipe, EOF, etc. - // These occur when the TCP connection is severed during a switchover. - var netErr *net.OpError - if errors.As(err, &netErr) { - return true - } - if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) { - return true - } - if errors.Is(err, context.DeadlineExceeded) { - return false // don't retry if the caller's context timed out - } - - // pgx sometimes wraps TCP-level disconnection errors as plain strings rather - // than preserving the original net.OpError type. These are safe to retry - // because they occur on the write path — the server never received the query. - // "conn closed" is pgx's own sentinel for a connection already closed before use. - // Note: "connection refused" and "unexpected EOF" are intentionally excluded — - // the former does not appear in pgx's codebase, the latter is already caught - // above by errors.Is(err, io.ErrUnexpectedEOF). - msg := err.Error() - if strings.Contains(msg, "connection reset by peer") || - strings.Contains(msg, "broken pipe") || - strings.Contains(msg, "conn closed") { - return true - } - - return false -} - -// retryJitterMax is the upper bound for the random delay between retries. -// Full jitter in [0, retryJitterMax) spreads concurrent retries across the -// fleet rather than letting them all slam the database at the same instant. -// AlloyDB planned maintenance switchovers typically cause less than one second -// of downtime, so three attempts with up to 100ms between each gives a worst- -// case retry window of ~200ms — well within typical service SLAs while still -// spanning the switchover blip. The caller's context deadline is the escape -// hatch for services with tighter latency budgets. -const retryJitterMax = 100 * time.Millisecond - -// RetryPostgres executes fn up to maxAttempts times, retrying on transient -// connection errors. This is intended for use around individual database -// operations to survive brief outages like AlloyDB maintenance switchovers. -func RetryPostgres(ctx context.Context, maxAttempts int, fn func() error) error { - if maxAttempts <= 0 { - maxAttempts = 3 - } - var err error - for attempt := 0; attempt < maxAttempts; attempt++ { - err = fn() - if err == nil { - return nil - } - if !IsRetryablePostgresError(err) { - return err - } - if attempt < maxAttempts-1 { - delay := time.Duration(rand.Int63n(int64(retryJitterMax))) - select { - case <-ctx.Done(): - return ctx.Err() - case <-time.After(delay): - } - } - } - return err -} diff --git a/database/postgres_test.go b/database/postgres_test.go index 9f663a0c..0781d452 100644 --- a/database/postgres_test.go +++ b/database/postgres_test.go @@ -2,15 +2,11 @@ package database_test import ( "context" - "errors" - "io" - "net" "os" "path/filepath" "testing" "time" - "github.com/jackc/pgx/v5/pgconn" "github.com/moov-io/base" "github.com/moov-io/base/database" "github.com/moov-io/base/database/testdb" @@ -184,116 +180,6 @@ func Test_Postgres_Alloy_Migrations(t *testing.T) { defer db.Close() } -func TestIsRetryablePostgresError(t *testing.T) { - // nil error is not retryable - require.False(t, database.IsRetryablePostgresError(nil)) - - // admin_shutdown is retryable (seen during AlloyDB maintenance) - require.True(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "57P01"})) - - // crash_shutdown is retryable - require.True(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "57P02"})) - - // cannot_connect_now is retryable - require.True(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "57P03"})) - - // serialization_failure and deadlock_detected are retryable (server rolled back) - require.True(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "40001"})) - require.True(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "40P01"})) - - // too_many_connections is retryable (rejected at connect time) - require.True(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "53300"})) - - // query_canceled is retryable (server rolled back the transaction) - require.True(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "57014"})) - - // unique_violation is NOT retryable (application-level error) - require.False(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "23505"})) - - // syntax_error is NOT retryable - require.False(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "42601"})) - - // EOF is retryable (connection severed) - require.True(t, database.IsRetryablePostgresError(io.EOF)) - require.True(t, database.IsRetryablePostgresError(io.ErrUnexpectedEOF)) - - // net.OpError is retryable - require.True(t, database.IsRetryablePostgresError(&net.OpError{ - Op: "read", - Err: errors.New("connection reset by peer"), - })) - - // context.DeadlineExceeded is NOT retryable - require.False(t, database.IsRetryablePostgresError(context.DeadlineExceeded)) - - // String-matched connection errors - require.True(t, database.IsRetryablePostgresError(errors.New("connection reset by peer"))) - require.True(t, database.IsRetryablePostgresError(errors.New("broken pipe"))) - require.True(t, database.IsRetryablePostgresError(errors.New("conn closed"))) - - // Random application error is NOT retryable - require.False(t, database.IsRetryablePostgresError(errors.New("invalid input"))) -} - -func TestRetryPostgres(t *testing.T) { - t.Run("succeeds on first attempt", func(t *testing.T) { - calls := 0 - err := database.RetryPostgres(context.Background(), 3, func() error { - calls++ - return nil - }) - require.NoError(t, err) - require.Equal(t, 1, calls) - }) - - t.Run("retries on transient error then succeeds", func(t *testing.T) { - calls := 0 - err := database.RetryPostgres(context.Background(), 3, func() error { - calls++ - if calls < 3 { - return &pgconn.PgError{Code: "57P01"} // admin_shutdown - } - return nil - }) - require.NoError(t, err) - require.Equal(t, 3, calls) - }) - - t.Run("does not retry non-retryable errors", func(t *testing.T) { - calls := 0 - err := database.RetryPostgres(context.Background(), 3, func() error { - calls++ - return &pgconn.PgError{Code: "23505"} // unique_violation - }) - require.Error(t, err) - require.Equal(t, 1, calls) - }) - - t.Run("respects context cancellation", func(t *testing.T) { - ctx, cancel := context.WithCancel(context.Background()) - cancel() // cancel immediately - - calls := 0 - err := database.RetryPostgres(ctx, 3, func() error { - calls++ - return io.EOF // retryable, but context is done - }) - // First call happens, then context cancellation is detected - require.ErrorIs(t, err, context.Canceled) - require.Equal(t, 1, calls) - }) - - t.Run("exhausts all attempts", func(t *testing.T) { - calls := 0 - err := database.RetryPostgres(context.Background(), 3, func() error { - calls++ - return io.EOF - }) - require.Error(t, err) - require.Equal(t, 3, calls) - require.ErrorIs(t, err, io.EOF) - }) -} func Test_Postgres_UniqueViolation(t *testing.T) { if testing.Short() { From 4e5e7c5476529a9cc724e358d435a40c75de19ca Mon Sep 17 00:00:00 2001 From: Steve Smith Date: Tue, 30 Jun 2026 15:16:37 -0600 Subject: [PATCH 09/12] database: restore original position of getPostgresConnStr Move getPostgresConnStr back to before buildAlloyDBPoolConfig to match the pre-PR ordering and keep the diff readable. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- database/postgres.go | 58 ++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/database/postgres.go b/database/postgres.go index 12db66a9..d0ccc516 100644 --- a/database/postgres.go +++ b/database/postgres.go @@ -65,6 +65,35 @@ func buildPgxPoolConfig(ctx context.Context, config PostgresConfig, databaseName return pgxpool.ParseConfig(connStr) } +func getPostgresConnStr(config PostgresConfig, databaseName string) (string, error) { + url := fmt.Sprintf("postgres://%s:%s@%s/%s", config.User, config.Password, config.Address, databaseName) + + params := "" + + if config.TLS != nil { + if len(config.TLS.Mode) < 1 { + config.TLS.Mode = "verify-full" + } + + params += "sslmode=" + config.TLS.Mode + + if len(config.TLS.CACertFile) > 0 { + params += "&sslrootcert=" + config.TLS.CACertFile + } + + if len(config.TLS.ClientCertFile) > 0 { + params += "&sslcert=" + config.TLS.ClientCertFile + } + + if len(config.TLS.ClientKeyFile) > 0 { + params += "&sslkey=" + config.TLS.ClientKeyFile + } + } + + connStr := fmt.Sprintf("%s?%s", url, params) + return connStr, nil +} + func buildAlloyDBPoolConfig(ctx context.Context, config PostgresConfig, databaseName string) (*pgxpool.Config, error) { if config.Alloy == nil { return nil, fmt.Errorf("missing alloy config") @@ -115,35 +144,6 @@ func buildAlloyDBPoolConfig(ctx context.Context, config PostgresConfig, database return poolConfig, nil } -func getPostgresConnStr(config PostgresConfig, databaseName string) (string, error) { - url := fmt.Sprintf("postgres://%s:%s@%s/%s", config.User, config.Password, config.Address, databaseName) - - params := "" - - if config.TLS != nil { - if len(config.TLS.Mode) < 1 { - config.TLS.Mode = "verify-full" - } - - params += "sslmode=" + config.TLS.Mode - - if len(config.TLS.CACertFile) > 0 { - params += "&sslrootcert=" + config.TLS.CACertFile - } - - if len(config.TLS.ClientCertFile) > 0 { - params += "&sslcert=" + config.TLS.ClientCertFile - } - - if len(config.TLS.ClientKeyFile) > 0 { - params += "&sslkey=" + config.TLS.ClientKeyFile - } - } - - connStr := fmt.Sprintf("%s?%s", url, params) - return connStr, nil -} - // PostgresUniqueViolation returns true when the provided error matches the Postgres code // for unique violation. func PostgresUniqueViolation(err error) bool { From 4f89e8353906d3440fdd0aa9df7a855b94ce695c Mon Sep 17 00:00:00 2001 From: Steve Smith Date: Tue, 30 Jun 2026 15:22:08 -0600 Subject: [PATCH 10/12] database: remove double blank line leftover from retry test removal Co-Authored-By: Claude Sonnet 4.6 (1M context) --- database/postgres_test.go | 1 - 1 file changed, 1 deletion(-) diff --git a/database/postgres_test.go b/database/postgres_test.go index 0781d452..68dcb14d 100644 --- a/database/postgres_test.go +++ b/database/postgres_test.go @@ -180,7 +180,6 @@ func Test_Postgres_Alloy_Migrations(t *testing.T) { defer db.Close() } - func Test_Postgres_UniqueViolation(t *testing.T) { if testing.Short() { t.Skip("-short flag enabled") From 596686cd8cad0635b5ffc40cf315839b7a66762a Mon Sep 17 00:00:00 2001 From: Steve Smith Date: Wed, 1 Jul 2026 17:07:19 -0600 Subject: [PATCH 11/12] database: replace HealthCheckPeriod with ShouldPing for dead connection detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HealthCheckPeriod only evicts connections that have exceeded their age thresholds — it does not ping. ShouldPing fires at acquire time and sends an actual liveness check to the server before handing the connection to the caller. Ping if idle > 200ms catches dead connections from an AlloyDB switchover without adding overhead on hot connections used moments ago. Remove ApplyPostgresConnectionsConfig and DefaultPostgresConnectionsConfig: sql.DB setters have no effect on the underlying pgxpool, and SetMaxIdleConns with a non-zero value actively breaks pgxpool by preventing connections from being released back to the pool. OpenDBFromPool already sets MaxIdleConns(0) automatically. Co-Authored-By: Claude Sonnet 4.6 (1M context) --- database/database.go | 25 +------------------- database/database_test.go | 48 --------------------------------------- database/model_config.go | 13 ----------- database/postgres.go | 21 +++++++++++------ 4 files changed, 15 insertions(+), 92 deletions(-) diff --git a/database/database.go b/database/database.go index 087dd5b8..dec12d18 100644 --- a/database/database.go +++ b/database/database.go @@ -40,7 +40,7 @@ func New(ctx context.Context, logger log.Logger, config DatabaseConfig) (*sql.DB if err != nil { return nil, fmt.Errorf("connecting to postgres: %w", err) } - return ApplyPostgresConnectionsConfig(db, &config.Postgres.Connections, logger), nil + return db, nil } return nil, ErrMissingConfig @@ -113,26 +113,3 @@ func ApplyConnectionsConfig(db *sql.DB, connections *ConnectionsConfig, logger l return db } -// ApplyPostgresConnectionsConfig applies connection pool settings with safe defaults -// for Postgres/AlloyDB. If any value in the provided config is zero, the corresponding -// default from DefaultPostgresConnectionsConfig is used. This ensures all services get -// failover-safe pool settings even if they don't explicitly configure them. -func ApplyPostgresConnectionsConfig(db *sql.DB, connections *ConnectionsConfig, logger log.Logger) *sql.DB { - defaults := DefaultPostgresConnectionsConfig() - - applied := *connections - if applied.MaxOpen <= 0 { - applied.MaxOpen = defaults.MaxOpen - } - if applied.MaxIdle <= 0 { - applied.MaxIdle = defaults.MaxIdle - } - if applied.MaxLifetime <= 0 { - applied.MaxLifetime = defaults.MaxLifetime - } - if applied.MaxIdleTime <= 0 { - applied.MaxIdleTime = defaults.MaxIdleTime - } - - return ApplyConnectionsConfig(db, &applied, logger) -} diff --git a/database/database_test.go b/database/database_test.go index abda3ddf..bc17f75c 100644 --- a/database/database_test.go +++ b/database/database_test.go @@ -5,16 +5,13 @@ package database_test import ( "bytes" - "database/sql" "errors" "os" "testing" - "time" gomysql "github.com/go-sql-driver/mysql" "github.com/jackc/pgx/v5/pgconn" "github.com/moov-io/base/database" - "github.com/moov-io/base/log" "github.com/stretchr/testify/require" ) @@ -124,51 +121,6 @@ func TestDataTooLong(t *testing.T) { } } -func TestApplyPostgresConnectionsConfig_Defaults(t *testing.T) { - // When no config values are set, defaults should be applied - db, err := sql.Open("txdb", "TestApplyPostgresConnectionsConfig_Defaults") - if err != nil { - t.Skip("skipping test without txdb driver") - } - defer db.Close() - - connections := &database.ConnectionsConfig{} - database.ApplyPostgresConnectionsConfig(db, connections, log.NewTestLogger()) - - defaults := database.DefaultPostgresConnectionsConfig() - stats := db.Stats() - require.Equal(t, defaults.MaxOpen, stats.MaxOpenConnections) -} - -func TestApplyPostgresConnectionsConfig_Overrides(t *testing.T) { - // When config values are set, they should override defaults - db, err := sql.Open("txdb", "TestApplyPostgresConnectionsConfig_Overrides") - if err != nil { - t.Skip("skipping test without txdb driver") - } - defer db.Close() - - connections := &database.ConnectionsConfig{ - MaxOpen: 10, - MaxIdle: 3, - MaxLifetime: time.Minute, - MaxIdleTime: time.Second * 15, - } - database.ApplyPostgresConnectionsConfig(db, connections, log.NewTestLogger()) - - stats := db.Stats() - require.Equal(t, 10, stats.MaxOpenConnections) -} - -func TestDefaultPostgresConnectionsConfig(t *testing.T) { - defaults := database.DefaultPostgresConnectionsConfig() - require.Greater(t, defaults.MaxOpen, 0) - require.Greater(t, defaults.MaxIdle, 0) - require.Greater(t, defaults.MaxLifetime, time.Duration(0)) - require.Greater(t, defaults.MaxIdleTime, time.Duration(0)) - // MaxIdleTime should be shorter than MaxLifetime - require.Less(t, defaults.MaxIdleTime, defaults.MaxLifetime) -} func TestConnectionsConfigOrder(t *testing.T) { bs, err := os.ReadFile("database.go") diff --git a/database/model_config.go b/database/model_config.go index d5b84150..ea8b9026 100644 --- a/database/model_config.go +++ b/database/model_config.go @@ -99,19 +99,6 @@ type ConnectionsConfig struct { MaxIdleTime time.Duration } -// DefaultPostgresConnectionsConfig returns connection pool defaults tuned for -// database failover recovery (e.g. AlloyDB maintenance switchovers). Short -// lifetimes and idle times ensure dead connections are evicted quickly so -// the pool re-establishes connections to the new primary. -func DefaultPostgresConnectionsConfig() ConnectionsConfig { - return ConnectionsConfig{ - MaxOpen: 25, - MaxIdle: 5, - MaxLifetime: 5 * time.Minute, - MaxIdleTime: 30 * time.Second, - } -} - type RetryConfig struct { MaxAttempts int MinDuration time.Duration diff --git a/database/postgres.go b/database/postgres.go index d0ccc516..a426f6f6 100644 --- a/database/postgres.go +++ b/database/postgres.go @@ -29,10 +29,16 @@ func postgresConnection(ctx context.Context, logger log.Logger, config PostgresC return nil, logger.LogErrorf("building pgx pool config: %w", err).Err() } - // HealthCheckPeriod makes pgxpool ping idle connections in the background. - // Dead connections (e.g. from an AlloyDB switchover) are evicted before - // the application ever sees them. - poolConfig.HealthCheckPeriod = 1 * time.Second + // Ping connections that have been idle for more than 200ms before handing + // them to the caller. This catches dead connections left by an AlloyDB + // switchover before a query is attempted, without adding overhead on + // hot connections used moments ago. + // HealthCheckPeriod (the background reaper) does NOT ping — it only evicts + // connections that have exceeded their age thresholds. ShouldPing is the + // mechanism that actually tests liveness at acquire time. + poolConfig.ShouldPing = func(_ context.Context, p pgxpool.ShouldPingParams) bool { + return p.IdleDuration > 200*time.Millisecond + } pool, err := pgxpool.NewWithConfig(ctx, poolConfig) if err != nil { @@ -45,9 +51,10 @@ func postgresConnection(ctx context.Context, logger log.Logger, config PostgresC return nil, logger.LogErrorf("connecting to database: %w", err).Err() } - // Wrap the pgxpool in a *sql.DB so the rest of the codebase doesn't change. - // pgxpool manages the real pool (with health checks); database/sql pool - // settings are applied on top via ApplyPostgresConnectionsConfig. + // OpenDBFromPool wraps pgxpool in a *sql.DB for compatibility with the rest + // of the codebase. It automatically sets MaxIdleConns to 0 on the sql.DB — + // this must not be overridden, as pgxpool manages its own connection pool + // and a non-zero value would prevent connections from being released back. db := stdlib.OpenDBFromPool(pool) return db, nil From f6d8d4cc8a5532939d224a1be9cf7a58c21c972b Mon Sep 17 00:00:00 2001 From: Jeff Braucher Date: Thu, 2 Jul 2026 11:59:44 -0400 Subject: [PATCH 12/12] Preserve ability to tune (some of) the connection config for the database when using pgxpool. --- database/postgres.go | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/database/postgres.go b/database/postgres.go index a426f6f6..dd326cd5 100644 --- a/database/postgres.go +++ b/database/postgres.go @@ -29,6 +29,8 @@ func postgresConnection(ctx context.Context, logger log.Logger, config PostgresC return nil, logger.LogErrorf("building pgx pool config: %w", err).Err() } + applyPgxPoolConnectionsConfig(logger, poolConfig, config.Connections) + // Ping connections that have been idle for more than 200ms before handing // them to the caller. This catches dead connections left by an AlloyDB // switchover before a query is attempted, without adding overhead on @@ -60,6 +62,31 @@ func postgresConnection(ctx context.Context, logger log.Logger, config PostgresC return db, nil } +// applyPgxPoolConnectionsConfig translates ConnectionsConfig onto a pgxpool.Config. +// MaxIdle has no pgxpool equivalent — pgxpool caps total connections via MaxConns +// rather than idle count, and keeps a floor via MinConns. When set, MaxIdle is +// logged and ignored so operators aren't misled into thinking it took effect. +func applyPgxPoolConnectionsConfig(logger log.Logger, poolConfig *pgxpool.Config, connections ConnectionsConfig) { + if connections.MaxOpen > 0 { + logger.Logf("setting pgx pool MaxConns to %d", connections.MaxOpen) + poolConfig.MaxConns = int32(connections.MaxOpen) + } + + if connections.MaxIdle > 0 { + logger.Logf("ignoring ConnectionsConfig.MaxIdle=%d: pgxpool has no MaxIdle equivalent", connections.MaxIdle) + } + + if connections.MaxIdleTime > 0 { + logger.Logf("setting pgx pool MaxConnIdleTime to %v", connections.MaxIdleTime) + poolConfig.MaxConnIdleTime = connections.MaxIdleTime + } + + if connections.MaxLifetime > 0 { + logger.Logf("setting pgx pool MaxConnLifetime to %v", connections.MaxLifetime) + poolConfig.MaxConnLifetime = connections.MaxLifetime + } +} + func buildPgxPoolConfig(ctx context.Context, config PostgresConfig, databaseName string) (*pgxpool.Config, error) { if config.Alloy != nil { return buildAlloyDBPoolConfig(ctx, config, databaseName)