From 192854765b15cbf8fbe58ed2c47f48120ec132f0 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Mon, 29 Jun 2026 15:49:17 +0000 Subject: [PATCH 01/13] database: split RetryPostgres into safe RetryPostgresTx and generic RetryUnsafe Implements the transaction-retry pattern suggested in PR #490 review comments (discussion_r3470551946 and discussion_r3487236671): - RetryPostgresTx wraps fn in an explicit transaction (fn receives a *sql.Tx) and retries the whole transaction on transient errors. This is the "safe" variant: the explicit transaction guarantees fn's work is atomic, so a retried attempt starts from a clean slate. - RetryUnsafe retries fn (no transaction wrapper) on any error by default; the caller vouches that fn is idempotent. This is the "unsafe" variant for cases where a transaction isn't feasible. - RetryMySQLTx and RetrySpannerTx are added as not-implemented placeholders so the cross-backend API surface is visible and the gap is enforced by tests. The safe variant's default classifier (isRetryablePostgresTxError) is the OR of three signals: 1. driver.ErrBadConn -- pgx's stdlib adapter converts pgconn.SafeToRetry-flagged Exec/Query errors to driver.ErrBadConn before fn sees them; this is the primary retry trigger inside a transaction. 2. pgconn.SafeToRetry -- covers Begin/Commit failures where the original pgx error flows through unchanged. 3. IsRetryablePostgresError -- server-returned SQLSTATE codes that guarantee rollback (40001, 40P01, 57P01, ...). RetryTxOptions.IsRetryable is additive on top of the default classifier so consumers can extend it with implementation-specific cases without narrowing the safety floor. RetryUnsafeOptions.IsRetryable replaces the default (caller takes full control). The existing RetryPostgres is removed (the PR is unmerged). A stacked follow-up will add pg_current_xact_id()/pg_xact_status() commit verification; a TODO marks the insertion point in retryTx. Co-authored-by: E-Love (Eric Loveland) --- database/mysql.go | 15 ++ database/postgres.go | 63 +++++--- database/postgres_test.go | 40 +++-- database/retry.go | 149 +++++++++++++++++ database/retry_test.go | 332 ++++++++++++++++++++++++++++++++++++++ database/spanner.go | 20 +++ 6 files changed, 582 insertions(+), 37 deletions(-) create mode 100644 database/retry.go create mode 100644 database/retry_test.go diff --git a/database/mysql.go b/database/mysql.go index 08e5da0d..06badb6b 100644 --- a/database/mysql.go +++ b/database/mysql.go @@ -245,3 +245,18 @@ func MySQLDeadlockFound(err error) bool { return strings.Contains(err.Error(), fmt.Sprintf("Error %d", mysqlErrDeadlockFound)) } + +// RetryMySQLTx is the MySQL equivalent of RetryPostgresTx. +// TODO: implement with MySQL retryable codes (1213 deadlock, 1205 lock wait +// timeout, 2013 connection lost, network resets). The shared retryTx helper +// in retry.go already handles driver.ErrBadConn (backend-agnostic) and the +// Begin/fn/Commit/Rollback loop; the missing piece is a MySQL-specific +// default classifier analogous to isRetryablePostgresTxError. See +// https://dev.mysql.com/doc/mysql-errors/8.0/en/server-error-reference.html +func RetryMySQLTx(ctx context.Context, db *sql.DB, opts RetryTxOptions, fn func(*sql.Tx) error) error { + _ = ctx + _ = db + _ = opts + _ = fn + return errors.New("database: RetryMySQLTx is not yet implemented; see TODO") +} diff --git a/database/postgres.go b/database/postgres.go index 4411d660..cce4ff27 100644 --- a/database/postgres.go +++ b/database/postgres.go @@ -3,10 +3,10 @@ package database import ( "context" "database/sql" + "database/sql/driver" "errors" "fmt" "io" - "math/rand" "net" "strings" "time" @@ -258,30 +258,43 @@ func IsRetryablePostgresError(err error) bool { // hatch for services with tighter latency budgets. const retryJitterMax = 100 * time.Millisecond -// RetryPostgres executes fn up to maxAttempts times, retrying on transient -// connection errors. This is intended for use around individual database -// operations to survive brief outages like AlloyDB maintenance switchovers. -func RetryPostgres(ctx context.Context, maxAttempts int, fn func() error) error { - if maxAttempts <= 0 { - maxAttempts = 3 +// isRetryablePostgresTxError is the default classifier used by RetryPostgresTx. +// It is the OR of three signals: +// +// 1. errors.Is(err, driver.ErrBadConn) — covers fn's tx.Exec/tx.Query +// pre-send failures. pgx's stdlib adapter converts pgconn.SafeToRetry- +// flagged errors from Exec/Query into driver.ErrBadConn before +// database/sql (and thus fn) sees them, discarding the original pgx +// error. This is the primary retry trigger inside a transaction. +// 2. pgconn.SafeToRetry(err) — covers Begin/Commit failures where the +// original pgx error flows through unchanged (connect errors, conn-busy, +// HA NotPreferredError, pre-send timeouts). +// 3. IsRetryablePostgresError(err) — covers server-returned SQLSTATE codes +// that guarantee rollback (40001, 40P01, 57P01, 57P02, 57P03, 53300, +// 57014). These are NOT SafeToRetry-flagged by pgx and flow through +// unchanged. +// +// driver.ErrBadConn is backend-agnostic, so the same "retry on bad conn" +// leg will carry over to RetryMySQLTx / RetrySpannerTx when implemented. +func isRetryablePostgresTxError(err error) bool { + if errors.Is(err, driver.ErrBadConn) { + return true } - var err error - for attempt := 0; attempt < maxAttempts; attempt++ { - err = fn() - if err == nil { - return nil - } - if !IsRetryablePostgresError(err) { - return err - } - if attempt < maxAttempts-1 { - delay := time.Duration(rand.Int63n(int64(retryJitterMax))) - select { - case <-ctx.Done(): - return ctx.Err() - case <-time.After(delay): - } - } + if pgconn.SafeToRetry(err) { + return true } - return err + return IsRetryablePostgresError(err) +} + +// RetryPostgresTx executes fn inside a Postgres transaction, retrying the +// entire transaction on transient errors. fn receives a fresh *sql.Tx each +// attempt; if fn returns an error the tx is rolled back, if nil the tx is +// committed. An error is retried when isRetryablePostgresTxError(err) OR +// opts.IsRetryable(err) is true. +// +// db MUST be a *sql.DB backed by the pgx stdlib adapter (e.g. one returned +// by database.New with a PostgresConfig). Using a *sql.DB backed by another +// driver will not produce pgx-specific retries. +func RetryPostgresTx(ctx context.Context, db *sql.DB, opts RetryTxOptions, fn func(*sql.Tx) error) error { + return retryTx(ctx, db, isRetryablePostgresTxError, opts, fn) } diff --git a/database/postgres_test.go b/database/postgres_test.go index 9f663a0c..75c0ea5e 100644 --- a/database/postgres_test.go +++ b/database/postgres_test.go @@ -235,10 +235,10 @@ func TestIsRetryablePostgresError(t *testing.T) { require.False(t, database.IsRetryablePostgresError(errors.New("invalid input"))) } -func TestRetryPostgres(t *testing.T) { +func TestRetryUnsafe(t *testing.T) { t.Run("succeeds on first attempt", func(t *testing.T) { calls := 0 - err := database.RetryPostgres(context.Background(), 3, func() error { + err := database.RetryUnsafe(context.Background(), database.RetryUnsafeOptions{MaxAttempts: 3}, func() error { calls++ return nil }) @@ -246,12 +246,15 @@ func TestRetryPostgres(t *testing.T) { require.Equal(t, 1, calls) }) - t.Run("retries on transient error then succeeds", func(t *testing.T) { + t.Run("retries on any error by default, including non-Pg-retryable ones", func(t *testing.T) { + // unique_violation is NOT retryable per IsRetryablePostgresError, but + // RetryUnsafe's default predicate retries any error except context + // cancellation because the caller has vouched that fn is idempotent. calls := 0 - err := database.RetryPostgres(context.Background(), 3, func() error { + err := database.RetryUnsafe(context.Background(), database.RetryUnsafeOptions{MaxAttempts: 3}, func() error { calls++ if calls < 3 { - return &pgconn.PgError{Code: "57P01"} // admin_shutdown + return &pgconn.PgError{Code: "23505"} // unique_violation } return nil }) @@ -259,33 +262,46 @@ func TestRetryPostgres(t *testing.T) { require.Equal(t, 3, calls) }) - t.Run("does not retry non-retryable errors", func(t *testing.T) { + t.Run("custom IsRetryable short-circuits non-retryable errors", func(t *testing.T) { calls := 0 - err := database.RetryPostgres(context.Background(), 3, func() error { + neverRetry := func(error) bool { return false } + err := database.RetryUnsafe(context.Background(), database.RetryUnsafeOptions{MaxAttempts: 3, IsRetryable: neverRetry}, func() error { calls++ - return &pgconn.PgError{Code: "23505"} // unique_violation + return io.EOF + }) + require.Error(t, err) + require.Equal(t, 1, calls) + require.ErrorIs(t, err, io.EOF) + }) + + t.Run("does not retry context cancellation by default", func(t *testing.T) { + calls := 0 + err := database.RetryUnsafe(context.Background(), database.RetryUnsafeOptions{MaxAttempts: 3}, func() error { + calls++ + return context.Canceled }) require.Error(t, err) require.Equal(t, 1, calls) + require.ErrorIs(t, err, context.Canceled) }) - t.Run("respects context cancellation", func(t *testing.T) { + t.Run("respects context cancellation between attempts", func(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) cancel() // cancel immediately calls := 0 - err := database.RetryPostgres(ctx, 3, func() error { + err := database.RetryUnsafe(ctx, database.RetryUnsafeOptions{MaxAttempts: 3}, func() error { calls++ return io.EOF // retryable, but context is done }) - // First call happens, then context cancellation is detected + // First call happens, then context cancellation is detected before the next attempt require.ErrorIs(t, err, context.Canceled) require.Equal(t, 1, calls) }) t.Run("exhausts all attempts", func(t *testing.T) { calls := 0 - err := database.RetryPostgres(context.Background(), 3, func() error { + err := database.RetryUnsafe(context.Background(), database.RetryUnsafeOptions{MaxAttempts: 3}, func() error { calls++ return io.EOF }) diff --git a/database/retry.go b/database/retry.go new file mode 100644 index 00000000..3625f6c5 --- /dev/null +++ b/database/retry.go @@ -0,0 +1,149 @@ +package database + +import ( + "context" + "database/sql" + "errors" + "math/rand" + "time" +) + +// RetryTxOptions configures the transactional (safe) retry functions +// (RetryPostgresTx and the future RetryMySQLTx / RetrySpannerTx). +type RetryTxOptions struct { + // MaxAttempts caps the number of transaction attempts. Defaults to 3 if <= 0. + MaxAttempts int + // TxOptions is passed to (*sql.DB).BeginTx on each attempt. nil = default isolation. + TxOptions *sql.TxOptions + // IsRetryable, if non-nil, is consulted IN ADDITION to the backend's default + // classifier (e.g. isRetryablePostgresTxError). Lets consumers add + // implementation-specific cases on top of the library's known-safe floor. + IsRetryable func(err error) bool +} + +// RetryUnsafeOptions configures RetryUnsafe. +type RetryUnsafeOptions struct { + // MaxAttempts caps the number of attempts. Defaults to 3 if <= 0. + MaxAttempts int + // IsRetryable, if nil, defaults to "retry on any error except + // context.Canceled / context.DeadlineExceeded". If non-nil, replaces the default. + IsRetryable func(err error) bool +} + +// beginTxer is satisfied by *sql.DB so the retry loop is testable with fakes +// without needing a real database driver. +type beginTxer interface { + BeginTx(ctx context.Context, opts *sql.TxOptions) (*sql.Tx, error) +} + +// RetryUnsafe executes fn up to MaxAttempts times, retrying on any error that +// opts.IsRetryable classifies as retryable (default: any error except context +// cancellation/deadline). fn MUST be idempotent — no transaction wrapper is +// provided, so a retry may re-execute work that already committed. +func RetryUnsafe(ctx context.Context, opts RetryUnsafeOptions, fn func() error) error { + maxAttempts := opts.MaxAttempts + if maxAttempts <= 0 { + maxAttempts = 3 + } + isRetryable := opts.IsRetryable + if isRetryable == nil { + isRetryable = isRetryableUnsafeDefault + } + var err error + for attempt := 0; attempt < maxAttempts; attempt++ { + err = fn() + if err == nil { + return nil + } + if !isRetryable(err) { + return err + } + if !sleepWithJitter(ctx, attempt, maxAttempts) { + return ctx.Err() + } + } + return err +} + +// isRetryableUnsafeDefault retries on any error except context cancellation +// / deadline, since the caller of RetryUnsafe has vouched that fn is idempotent. +func isRetryableUnsafeDefault(err error) bool { + if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + return false + } + return true +} + +// retryTx is the shared transactional retry loop used by RetryPostgresTx and +// the future RetryMySQLTx / RetrySpannerTx implementations. defaultClassifier +// is the backend's known-safe floor; opts.IsRetryable is additive on top of it. +func retryTx(ctx context.Context, db beginTxer, defaultClassifier func(error) bool, opts RetryTxOptions, fn func(*sql.Tx) error) error { + maxAttempts := opts.MaxAttempts + if maxAttempts <= 0 { + maxAttempts = 3 + } + isRetryable := defaultClassifier + if opts.IsRetryable != nil { + extra := opts.IsRetryable + isRetryable = func(err error) bool { + return defaultClassifier(err) || extra(err) + } + } + var lastErr error + for attempt := 0; attempt < maxAttempts; attempt++ { + lastErr = runOneTxAttempt(ctx, db, opts.TxOptions, fn) + if lastErr == nil { + return nil + } + if !isRetryable(lastErr) { + return lastErr + } + if !sleepWithJitter(ctx, attempt, maxAttempts) { + return ctx.Err() + } + } + return lastErr +} + +// runOneTxAttempt runs a single begin/fn/commit attempt and returns the first +// error encountered (from BeginTx, fn, or Commit). On a successful commit it +// returns nil. The deferred Rollback is a no-op after a successful Commit and +// cleans up the transaction if fn returns an error or panics. +func runOneTxAttempt(ctx context.Context, db beginTxer, txOpts *sql.TxOptions, fn func(*sql.Tx) error) (err error) { + tx, beginErr := db.BeginTx(ctx, txOpts) + if beginErr != nil { + return beginErr + } + defer func() { + if p := recover(); p != nil { + _ = tx.Rollback() + panic(p) + } + if err != nil { + _ = tx.Rollback() + } + }() + if err = fn(tx); err != nil { + return err + } + // TODO(stacked-pr): capture pg_current_xact_id() before commit and + // consult pg_xact_status() on commit error to detect whether the commit + // landed. Postgres-only; see RetryPostgresTx. + return tx.Commit() +} + +// sleepWithJitter waits for a random duration in [0, retryJitterMax) before +// the next retry attempt, respecting ctx. It returns false if ctx was done +// before the sleep completed. It does not sleep after the final attempt. +func sleepWithJitter(ctx context.Context, attempt, maxAttempts int) bool { + if attempt >= maxAttempts-1 { + return true + } + delay := time.Duration(rand.Int63n(int64(retryJitterMax))) + select { + case <-ctx.Done(): + return false + case <-time.After(delay): + return true + } +} diff --git a/database/retry_test.go b/database/retry_test.go new file mode 100644 index 00000000..851c9633 --- /dev/null +++ b/database/retry_test.go @@ -0,0 +1,332 @@ +package database + +import ( + "context" + "database/sql" + "database/sql/driver" + "errors" + "fmt" + "io" + "sync" + "testing" + + "github.com/jackc/pgx/v5/pgconn" + "github.com/stretchr/testify/require" +) + +// safeToRetryErr is a test double for a pgx error that pgconn.SafeToRetry +// flags as safe (i.e. it implements the unexported interface{ SafeToRetry() bool } +// that pgconn.SafeToRetry looks for via errors.As). pgx's own SafeToRetry-flagged +// types have unexported fields, so we construct one directly here. +type safeToRetryErr struct{ msg string } + +func (e *safeToRetryErr) Error() string { return e.msg } +func (e *safeToRetryErr) SafeToRetry() bool { return true } + +// mockConnector + mockDriver + mockConn + mockTx form a minimal +// database/sql/driver implementation whose BeginTx/Commit/Rollback/Exec return +// configurable, stateful error sequences. This lets RetryPostgresTx (and the +// shared retryTx helper it delegates to) be driven through a real *sql.DB +// without a Postgres. *sql.Tx is concrete, so a mock driver is the only way +// to control Commit/Rollback errors. +type mockConnector struct{ d *mockDriver } + +func (c *mockConnector) Connect(context.Context) (driver.Conn, error) { + return &mockConn{d: c.d}, nil +} +func (c *mockConnector) Driver() driver.Driver { return c.d } + +type mockDriver struct { + beginErrs []error + commitErrs []error + rollbackErrs []error + execErrs []error + mu sync.Mutex + beginCalls int + commitCalls int + rollbackCalls int + execCalls int +} + +func (d *mockDriver) Open(string) (driver.Conn, error) { return &mockConn{d: d}, nil } + +func (d *mockDriver) nthBegin() (int, error) { + d.mu.Lock() + defer d.mu.Unlock() + i := d.beginCalls + d.beginCalls++ + return i, nthErr(d.beginErrs, i) +} +func (d *mockDriver) nthCommit() (int, error) { + d.mu.Lock() + defer d.mu.Unlock() + i := d.commitCalls + d.commitCalls++ + return i, nthErr(d.commitErrs, i) +} +func (d *mockDriver) nthRollback() (int, error) { + d.mu.Lock() + defer d.mu.Unlock() + i := d.rollbackCalls + d.rollbackCalls++ + return i, nthErr(d.rollbackErrs, i) +} + +type mockConn struct{ d *mockDriver } + +func (c *mockConn) BeginTx(context.Context, driver.TxOptions) (driver.Tx, error) { + if _, err := c.d.nthBegin(); err != nil { + return nil, err + } + return &mockTx{d: c.d}, nil +} +func (c *mockConn) Begin() (driver.Tx, error) { + return c.BeginTx(context.Background(), driver.TxOptions{}) +} +func (c *mockConn) Close() error { return nil } +func (c *mockConn) Prepare(query string) (driver.Stmt, error) { + return c.PrepareContext(context.Background(), query) +} +func (c *mockConn) PrepareContext(context.Context, string) (driver.Stmt, error) { + return nil, errors.New("mockDriver: PrepareContext not implemented") +} + +type mockTx struct{ d *mockDriver } + +func (t *mockTx) Commit() error { + _, err := t.d.nthCommit() + return err +} +func (t *mockTx) Rollback() error { + _, err := t.d.nthRollback() + return err +} + +// nthErr returns errs[i] or nil when i is out of range. Tests pass a sequence +// like []error{err1, err2} to mean "fail twice then succeed". +func nthErr(errs []error, i int) error { + if i < len(errs) { + return errs[i] + } + return nil +} + +func newMockDB(d *mockDriver) *sql.DB { + return sql.OpenDB(&mockConnector{d: d}) +} + +func TestRetryPostgresTx(t *testing.T) { + t.Run("succeeds on first attempt", func(t *testing.T) { + d := &mockDriver{} + db := newMockDB(d) + defer db.Close() + + calls := 0 + err := RetryPostgresTx(context.Background(), db, RetryTxOptions{MaxAttempts: 3}, func(*sql.Tx) error { + calls++ + return nil + }) + require.NoError(t, err) + require.Equal(t, 1, calls) + require.Equal(t, 1, d.beginCalls) + require.Equal(t, 1, d.commitCalls) + }) + + t.Run("fn error then succeed rolls back between attempts", func(t *testing.T) { + d := &mockDriver{} + db := newMockDB(d) + defer db.Close() + + calls := 0 + err := RetryPostgresTx(context.Background(), db, RetryTxOptions{MaxAttempts: 3}, func(*sql.Tx) error { + calls++ + if calls < 2 { + return &pgconn.PgError{Code: "40001"} // serialization_failure -> retryable + } + return nil + }) + require.NoError(t, err) + require.Equal(t, 2, calls) + // Two begins, two commits (first attempt's commit is skipped because fn + // returned an error -> rollback instead). + require.Equal(t, 2, d.beginCalls) + require.Equal(t, 1, d.commitCalls) // only the successful attempt commits + require.Equal(t, 1, d.rollbackCalls) + }) + + t.Run("non-retryable fn error short-circuits", func(t *testing.T) { + d := &mockDriver{} + db := newMockDB(d) + defer db.Close() + + calls := 0 + err := RetryPostgresTx(context.Background(), db, RetryTxOptions{MaxAttempts: 3}, func(*sql.Tx) error { + calls++ + return &pgconn.PgError{Code: "23505"} // unique_violation -> NOT retryable + }) + require.Error(t, err) + require.Equal(t, 1, calls) + require.Equal(t, 1, d.beginCalls) + require.Equal(t, 0, d.commitCalls) + require.Equal(t, 1, d.rollbackCalls) + }) + + t.Run("driver.ErrBadConn from fn is retried (fn-path signal)", func(t *testing.T) { + // pgx's stdlib adapter converts SafeToRetry-flagged Exec/Query errors to + // driver.ErrBadConn before fn sees them, so this is the primary retry + // trigger inside a transaction. + d := &mockDriver{} + db := newMockDB(d) + defer db.Close() + + calls := 0 + err := RetryPostgresTx(context.Background(), db, RetryTxOptions{MaxAttempts: 3}, func(*sql.Tx) error { + calls++ + if calls < 2 { + return driver.ErrBadConn + } + return nil + }) + require.NoError(t, err) + require.Equal(t, 2, calls) + require.Equal(t, 2, d.beginCalls) + }) + + t.Run("SafeToRetry-flagged Begin failure is retried (Begin/Commit-path signal)", func(t *testing.T) { + // Begin/Commit errors flow through the stdlib adapter unchanged, so + // pgconn.SafeToRetry sees the original pgx error. *sql.DB does not + // internally retry on non-ErrBadConn errors, so our retry loop sees it. + d := &mockDriver{ + beginErrs: []error{&safeToRetryErr{msg: "pre-send begin failure"}}, + } + db := newMockDB(d) + defer db.Close() + + calls := 0 + err := RetryPostgresTx(context.Background(), db, RetryTxOptions{MaxAttempts: 3}, func(*sql.Tx) error { + calls++ + return nil + }) + require.NoError(t, err) + require.Equal(t, 1, calls) + require.Equal(t, 2, d.beginCalls) // first Begin failed, second succeeded + }) + + t.Run("SafeToRetry-flagged Commit failure is retried", func(t *testing.T) { + d := &mockDriver{ + commitErrs: []error{&safeToRetryErr{msg: "pre-send commit failure"}}, + } + db := newMockDB(d) + defer db.Close() + + calls := 0 + err := RetryPostgresTx(context.Background(), db, RetryTxOptions{MaxAttempts: 3}, func(*sql.Tx) error { + calls++ + return nil + }) + require.NoError(t, err) + require.Equal(t, 2, calls) + require.Equal(t, 2, d.beginCalls) + require.Equal(t, 2, d.commitCalls) // first commit failed, second succeeded + }) + + t.Run("additive opts.IsRetryable extends the default classifier", func(t *testing.T) { + // sentinelErr is not driver.ErrBadConn, not SafeToRetry-flagged, and not + // an IsRetryablePostgresError SQLSTATE code, so the default classifier + // returns false. opts.IsRetryable adds it on top. + sentinelErr := errors.New("custom retryable sentinel") + + d := &mockDriver{} + db := newMockDB(d) + defer db.Close() + + calls := 0 + err := RetryPostgresTx(context.Background(), db, RetryTxOptions{ + MaxAttempts: 3, + IsRetryable: func(err error) bool { return errors.Is(err, sentinelErr) }, + }, func(*sql.Tx) error { + calls++ + if calls < 2 { + return sentinelErr + } + return nil + }) + require.NoError(t, err) + require.Equal(t, 2, calls) + }) + + t.Run("respects context cancellation between attempts", func(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + d := &mockDriver{} + db := newMockDB(d) + defer db.Close() + + calls := 0 + err := RetryPostgresTx(ctx, db, RetryTxOptions{MaxAttempts: 3}, func(*sql.Tx) error { + calls++ + if calls == 1 { + cancel() // cancel after the first attempt runs + return &pgconn.PgError{Code: "40001"} // retryable, but ctx is now done + } + return nil + }) + // First attempt runs, then context cancellation is detected before the next attempt + require.ErrorIs(t, err, context.Canceled) + require.Equal(t, 1, calls) + }) + + t.Run("exhausts all attempts", func(t *testing.T) { + d := &mockDriver{} + db := newMockDB(d) + defer db.Close() + + calls := 0 + err := RetryPostgresTx(context.Background(), db, RetryTxOptions{MaxAttempts: 3}, func(*sql.Tx) error { + calls++ + return &pgconn.PgError{Code: "40P01"} // deadlock_detected -> retryable + }) + require.Error(t, err) + require.Equal(t, 3, calls) + require.Equal(t, 3, d.beginCalls) + require.Equal(t, 0, d.commitCalls) // fn always errors, so no commit + require.Equal(t, 3, d.rollbackCalls) + }) +} + +func TestIsRetryablePostgresTxError(t *testing.T) { + // Leg 1: driver.ErrBadConn (fn-path signal — pgx stdlib adapter converts + // SafeToRetry Exec/Query errors to this). + require.True(t, isRetryablePostgresTxError(driver.ErrBadConn)) + require.True(t, isRetryablePostgresTxError(fmt.Errorf("wrapped: %w", driver.ErrBadConn))) + + // Leg 2: pgconn.SafeToRetry-flagged (Begin/Commit-path signal). + require.True(t, isRetryablePostgresTxError(&safeToRetryErr{msg: "pre-send"})) + + // Leg 3: IsRetryablePostgresError SQLSTATE codes that guarantee rollback. + require.True(t, isRetryablePostgresTxError(&pgconn.PgError{Code: "40001"})) // serialization_failure + require.True(t, isRetryablePostgresTxError(&pgconn.PgError{Code: "40P01"})) // deadlock_detected + require.True(t, isRetryablePostgresTxError(&pgconn.PgError{Code: "57P01"})) // admin_shutdown + + // Non-retryable: unique_violation is an application error, not a transient one. + require.False(t, isRetryablePostgresTxError(&pgconn.PgError{Code: "23505"})) + // Non-retryable: a random error that matches none of the three legs. + require.False(t, isRetryablePostgresTxError(errors.New("some application error"))) + // Non-retryable: context cancellation is never retryable. + require.False(t, isRetryablePostgresTxError(context.Canceled)) + // Network errors are retryable via IsRetryablePostgresError's typed checks. + require.True(t, isRetryablePostgresTxError(io.EOF)) +} + +func TestRetryMySQLTxNotImplemented(t *testing.T) { + err := RetryMySQLTx(context.Background(), nil, RetryTxOptions{}, func(*sql.Tx) error { return nil }) + require.Error(t, err) + require.Contains(t, err.Error(), "not yet implemented") +} + +func TestRetrySpannerTxNotImplemented(t *testing.T) { + err := RetrySpannerTx(context.Background(), nil, RetryTxOptions{}, func(*sql.Tx) error { return nil }) + require.Error(t, err) + require.Contains(t, err.Error(), "not yet implemented") +} diff --git a/database/spanner.go b/database/spanner.go index 220f7939..f1986977 100644 --- a/database/spanner.go +++ b/database/spanner.go @@ -1,7 +1,9 @@ package database import ( + "context" "database/sql" + "errors" "fmt" "strings" @@ -41,3 +43,21 @@ func SpannerUniqueViolation(err error) bool { return spanner.ErrCode(err) == codes.AlreadyExists || strings.Contains(err.Error(), "AlreadyExists") } + +// RetrySpannerTx is the Spanner equivalent of RetryPostgresTx. +// TODO: consider delegating to spannerdriver.RunTransactionWithOptions for +// ABORTED replay (it replays statements + buffered mutations on a new +// transaction and surfaces ErrAbortedDueToConcurrentModification when the +// replay sees different data), plus network-error retry on top. The shared +// retryTx helper in retry.go already handles driver.ErrBadConn +// (backend-agnostic) and the Begin/fn/Commit/Rollback loop; the missing +// piece is a Spanner-specific default classifier analogous to +// isRetryablePostgresTxError. See +// https://pkg.go.dev/github.com/googleapis/go-sql-spanner +func RetrySpannerTx(ctx context.Context, db *sql.DB, opts RetryTxOptions, fn func(*sql.Tx) error) error { + _ = ctx + _ = db + _ = opts + _ = fn + return errors.New("database: RetrySpannerTx is not yet implemented; see TODO") +} From 39cee60354a1cfc616e1458f310e96c35625a2d0 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Mon, 29 Jun 2026 18:32:17 +0000 Subject: [PATCH 02/13] database: trim IsRetryablePostgresError to server-rolled-back codes only Addresses review comments r3482973525 and r3483137216 on PR #490: without an explicit transaction there is no way to know whether a network error occurred before or after the server accepted the query, so retrying could duplicate committed work. IsRetryablePostgresError now classifies only PostgreSQL SQLSTATE codes that the server guarantees were rolled back before the error was returned (40001, 40P01, 57P01, 57P02, 57P03, 53300, 57014). The net.OpError, io.EOF, io.ErrUnexpectedEOF, and string-matching ("connection reset by peer"/"broken pipe"/"conn closed") checks are removed. The context.DeadlineExceeded check is removed as redundant (non-PgError errors already return false). The doc comment explains why network errors are excluded and points callers to RetryPostgresTx for transactional retry. The network-error checks move into a new internal isPostgresNetworkError helper, which is wired into isRetryablePostgresTxError as a 4th leg. Network errors ARE safe to retry within an explicit transaction (the server rolls back the uncommitted transaction), so RetryPostgresTx keeps them. String matching is dropped entirely -- driver.ErrBadConn and pgconn.SafeToRetry cover pgx's pre-send cases, and the typed net.OpError / io checks cover post-send cases, so substring matching is redundant and fragile. Tests updated: - TestIsRetryablePostgresError: io.EOF, io.ErrUnexpectedEOF, net.OpError, and the string-matched errors now assert false; SQLSTATE assertions unchanged. - TestIsRetryablePostgresTxError: added a Leg 4 section asserting io.EOF, io.ErrUnexpectedEOF, and net.OpError are retryable via the transactional classifier. Co-authored-by: E-Love (Eric Loveland) --- database/postgres.go | 84 +++++++++++++++++++++++---------------- database/postgres_test.go | 26 +++++++----- database/retry_test.go | 12 +++++- 3 files changed, 75 insertions(+), 47 deletions(-) diff --git a/database/postgres.go b/database/postgres.go index cce4ff27..e6aaaa57 100644 --- a/database/postgres.go +++ b/database/postgres.go @@ -176,17 +176,23 @@ func PostgresDeadlockFound(err error) bool { return strings.Contains(err.Error(), postgresErrDeadlockFound) } -// IsRetryablePostgresError returns true if the error is a transient connection-level -// error that is safe to retry. This covers the errors seen during AlloyDB maintenance -// switchovers and other transient network failures. +// IsRetryablePostgresError returns true if the error is a PostgreSQL SQLSTATE +// code that the server guarantees was rolled back before the error was +// returned. These are safe to retry even without an explicit transaction +// because the server has already undone any partial work. +// +// Network-level errors (connection reset, broken pipe, EOF, etc.) are +// intentionally NOT classified as retryable here: without an explicit +// transaction there is no way to know whether the error occurred before or +// after the server accepted the query, so retrying could duplicate committed +// work. Callers who wrap their work in a transaction should use RetryPostgresTx, +// whose classifier (isRetryablePostgresTxError) adds network errors back in +// because the transaction makes them safe to retry. func IsRetryablePostgresError(err error) bool { if err == nil { return false } - // PostgreSQL error codes that are safe to retry because the server guarantees - // the transaction was rolled back before the error was returned. - // // 57P01 admin_shutdown, 57P02 crash_shutdown: fast shutdown rolls back all // active transactions before disconnecting clients. See: // https://www.postgresql.org/docs/current/server-shutdown.html @@ -204,48 +210,43 @@ func IsRetryablePostgresError(err error) bool { // this error. // // Note: 08xxx (connection_exception class) codes are intentionally omitted. - // pgx surfaces connection-level failures as TCP/network errors, not as - // *pgconn.PgError, so those cases are handled below. + // pgx surfaces connection-level failures as network errors, not as + // *pgconn.PgError, so those cases are handled by isPostgresNetworkError + // (used by isRetryablePostgresTxError within a transaction). var pgErr *pgconn.PgError if errors.As(err, &pgErr) { switch pgErr.Code { case "57P01", "57P02", "57P03", // admin_shutdown, crash_shutdown, cannot_connect_now "40001", "40P01", // serialization_failure, deadlock_detected - "53300", // too_many_connections - "57014": // query_canceled + "53300", // too_many_connections + "57014": // query_canceled return true } return false } - // Network-level errors: connection reset, broken pipe, EOF, etc. - // These occur when the TCP connection is severed during a switchover. + // Non-PgError errors (network errors, context errors, application errors) + // are not retryable here — see the doc comment above. + return false +} + +// isPostgresNetworkError returns true for typed network-level errors that +// indicate the TCP connection was severed. These are safe to retry ONLY within +// an explicit transaction (the server rolls back the uncommitted transaction), +// so this helper is used by isRetryablePostgresTxError and NOT by +// IsRetryablePostgresError. +// +// String-matching on error messages is intentionally avoided: pgx flags +// pre-send failures via pgconn.SafeToRetry (which the stdlib adapter converts +// to driver.ErrBadConn), and post-send failures surface as typed net.OpError / +// io errors, so the typed checks are sufficient and far less fragile than +// substring matching. +func isPostgresNetworkError(err error) bool { var netErr *net.OpError if errors.As(err, &netErr) { return true } - if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) { - return true - } - if errors.Is(err, context.DeadlineExceeded) { - return false // don't retry if the caller's context timed out - } - - // pgx sometimes wraps TCP-level disconnection errors as plain strings rather - // than preserving the original net.OpError type. These are safe to retry - // because they occur on the write path — the server never received the query. - // "conn closed" is pgx's own sentinel for a connection already closed before use. - // Note: "connection refused" and "unexpected EOF" are intentionally excluded — - // the former does not appear in pgx's codebase, the latter is already caught - // above by errors.Is(err, io.ErrUnexpectedEOF). - msg := err.Error() - if strings.Contains(msg, "connection reset by peer") || - strings.Contains(msg, "broken pipe") || - strings.Contains(msg, "conn closed") { - return true - } - - return false + return errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) } // retryJitterMax is the upper bound for the random delay between retries. @@ -259,7 +260,7 @@ func IsRetryablePostgresError(err error) bool { const retryJitterMax = 100 * time.Millisecond // isRetryablePostgresTxError is the default classifier used by RetryPostgresTx. -// It is the OR of three signals: +// It is the OR of four signals: // // 1. errors.Is(err, driver.ErrBadConn) — covers fn's tx.Exec/tx.Query // pre-send failures. pgx's stdlib adapter converts pgconn.SafeToRetry- @@ -273,6 +274,16 @@ const retryJitterMax = 100 * time.Millisecond // that guarantee rollback (40001, 40P01, 57P01, 57P02, 57P03, 53300, // 57014). These are NOT SafeToRetry-flagged by pgx and flow through // unchanged. +// 4. isPostgresNetworkError(err) — covers typed network errors (net.OpError, +// io.EOF, io.ErrUnexpectedEOF) where the TCP connection was severed. +// These are safe to retry within an explicit transaction because the +// server rolls back the uncommitted transaction. This leg is intentionally +// NOT part of IsRetryablePostgresError (which is for use without a +// transaction, where network errors are unsafe because you can't know if +// the query landed). The residual timing issue — commit succeeded but the +// response was lost — is accepted here and will be addressed by the +// stacked pg_current_xact_id()/pg_xact_status() commit-verification +// follow-up. // // driver.ErrBadConn is backend-agnostic, so the same "retry on bad conn" // leg will carry over to RetryMySQLTx / RetrySpannerTx when implemented. @@ -283,6 +294,9 @@ func isRetryablePostgresTxError(err error) bool { if pgconn.SafeToRetry(err) { return true } + if isPostgresNetworkError(err) { + return true + } return IsRetryablePostgresError(err) } diff --git a/database/postgres_test.go b/database/postgres_test.go index 75c0ea5e..6b000d94 100644 --- a/database/postgres_test.go +++ b/database/postgres_test.go @@ -213,24 +213,28 @@ func TestIsRetryablePostgresError(t *testing.T) { // syntax_error is NOT retryable require.False(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "42601"})) - // EOF is retryable (connection severed) - require.True(t, database.IsRetryablePostgresError(io.EOF)) - require.True(t, database.IsRetryablePostgresError(io.ErrUnexpectedEOF)) - - // net.OpError is retryable - require.True(t, database.IsRetryablePostgresError(&net.OpError{ + // Network errors (EOF, net.OpError) are NOT retryable here: without an + // explicit transaction there is no way to know whether the error occurred + // before or after the server accepted the query, so retrying could + // duplicate committed work. Use RetryPostgresTx (whose classifier adds + // network errors back in) for transactional retry. + require.False(t, database.IsRetryablePostgresError(io.EOF)) + require.False(t, database.IsRetryablePostgresError(io.ErrUnexpectedEOF)) + require.False(t, database.IsRetryablePostgresError(&net.OpError{ Op: "read", Err: errors.New("connection reset by peer"), })) + // String-matched connection errors are NOT retryable: string matching is + // fragile, and the typed checks above (plus driver.ErrBadConn and + // pgconn.SafeToRetry in the transactional classifier) cover the real cases. + require.False(t, database.IsRetryablePostgresError(errors.New("connection reset by peer"))) + require.False(t, database.IsRetryablePostgresError(errors.New("broken pipe"))) + require.False(t, database.IsRetryablePostgresError(errors.New("conn closed"))) + // context.DeadlineExceeded is NOT retryable require.False(t, database.IsRetryablePostgresError(context.DeadlineExceeded)) - // String-matched connection errors - require.True(t, database.IsRetryablePostgresError(errors.New("connection reset by peer"))) - require.True(t, database.IsRetryablePostgresError(errors.New("broken pipe"))) - require.True(t, database.IsRetryablePostgresError(errors.New("conn closed"))) - // Random application error is NOT retryable require.False(t, database.IsRetryablePostgresError(errors.New("invalid input"))) } diff --git a/database/retry_test.go b/database/retry_test.go index 851c9633..b6d06e77 100644 --- a/database/retry_test.go +++ b/database/retry_test.go @@ -7,6 +7,7 @@ import ( "errors" "fmt" "io" + "net" "sync" "testing" @@ -315,8 +316,17 @@ func TestIsRetryablePostgresTxError(t *testing.T) { require.False(t, isRetryablePostgresTxError(errors.New("some application error"))) // Non-retryable: context cancellation is never retryable. require.False(t, isRetryablePostgresTxError(context.Canceled)) - // Network errors are retryable via IsRetryablePostgresError's typed checks. + + // Leg 4: isPostgresNetworkError (typed network errors — safe within a + // transaction because the server rolls back the uncommitted tx). These are + // intentionally NOT in IsRetryablePostgresError (unsafe without a tx), but + // IS safe here. require.True(t, isRetryablePostgresTxError(io.EOF)) + require.True(t, isRetryablePostgresTxError(io.ErrUnexpectedEOF)) + require.True(t, isRetryablePostgresTxError(&net.OpError{ + Op: "read", + Err: errors.New("connection reset by peer"), + })) } func TestRetryMySQLTxNotImplemented(t *testing.T) { From 926d26caf3c1ebe69d979ed44c11115dca466cb2 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Mon, 29 Jun 2026 18:56:14 +0000 Subject: [PATCH 03/13] database: split retry classifier into pre-commit (broad) and commit (narrow) phases Addresses review feedback questioning the claim that commit-phase errors always roll back: a commit-phase error is ambiguous because the commit may have succeeded on the server before the error was returned to the client (e.g., the TCP connection was severed after the COMMIT message was sent but before the response was received). Retrying such an error could duplicate the committed work. Changes: - IsRetryablePostgresError -> isSafeRetryablePostgresError (unexported). It classified only SQLSTATE codes that the server guarantees were rolled back, and the recommended path for callers is now RetryPostgresTx or RetryUnsafe, not building a custom retry loop on top of this classifier. - isRetryablePostgresTxError -> isRetryablePostgresPreCommitError (broad, for errors from BeginTx or fn): driver.ErrBadConn + pgconn.SafeToRetry + isSafeRetryablePostgresError + isPostgresNetworkError. Any error from Begin or fn is safe to retry because the transaction rolls back, so network errors are included. Added a comment on the pgconn.SafeToRetry leg explaining pgx guarantees these always occur before any data is sent to the server. - New isRetryablePostgresCommitError (narrow, for errors from (*sql.Tx).Commit): driver.ErrBadConn + pgconn.SafeToRetry + isSafeRetryablePostgresError. Network errors are intentionally EXCLUDED because they could mean the commit succeeded but the response was lost. - New ErrCommitPhase sentinel: commit-phase errors that are not retried (e.g., a network error during commit) are wrapped with ErrCommitPhase so callers can detect them via errors.Is and decide whether to alert, reconcile, or check whether the commit landed (future pg_xact_status() work). The underlying error is preserved for inspection. - retryTx now takes a retryClassifier (preCommit + commit funcs) and runOneTxAttempt returns (err, commitPhase bool); the retry loop applies the commit classifier when commitPhase is true and wraps the final commit-phase error with ErrCommitPhase. opts.IsRetryable remains additive on top of BOTH phases. - TODO(stacked-pr) -> TODO(future) for the pg_current_xact_id()/ pg_xact_status() commit-verification hook. Tests: - Removed TestIsRetryablePostgresError from postgres_test.go (function is now private); SQLSTATE coverage moved to internal TestIsSafeRetryablePostgresError. - Renamed TestIsRetryablePostgresTxError -> TestIsRetryablePostgresPreCommitError. - Added TestIsRetryablePostgresCommitError (network NOT retryable at commit). - Added TestRetryPostgresTx subtests: commit-phase network error is not retried and is wrapped with ErrCommitPhase; commit-phase exhaustion wraps the final error with ErrCommitPhase. - Added TestErrCommitPhase (sentinel detection + underlying error preserved). Co-authored-by: E-Love (Eric Loveland) --- database/postgres.go | 130 +++++++++++++++++++++-------- database/postgres_test.go | 57 ------------- database/retry.go | 96 ++++++++++++++++----- database/retry_test.go | 171 ++++++++++++++++++++++++++++++++------ 4 files changed, 315 insertions(+), 139 deletions(-) diff --git a/database/postgres.go b/database/postgres.go index e6aaaa57..b898e315 100644 --- a/database/postgres.go +++ b/database/postgres.go @@ -176,19 +176,22 @@ func PostgresDeadlockFound(err error) bool { return strings.Contains(err.Error(), postgresErrDeadlockFound) } -// IsRetryablePostgresError returns true if the error is a PostgreSQL SQLSTATE -// code that the server guarantees was rolled back before the error was -// returned. These are safe to retry even without an explicit transaction +// isSafeRetryablePostgresError returns true if the error is a PostgreSQL +// SQLSTATE code that the server guarantees was rolled back before the error +// was returned. These are safe to retry even without an explicit transaction // because the server has already undone any partial work. // // Network-level errors (connection reset, broken pipe, EOF, etc.) are // intentionally NOT classified as retryable here: without an explicit // transaction there is no way to know whether the error occurred before or // after the server accepted the query, so retrying could duplicate committed -// work. Callers who wrap their work in a transaction should use RetryPostgresTx, -// whose classifier (isRetryablePostgresTxError) adds network errors back in -// because the transaction makes them safe to retry. -func IsRetryablePostgresError(err error) bool { +// work. isRetryablePostgresPreCommitError adds network errors back in for use +// within RetryPostgresTx, where the transaction makes them safe to retry. +// +// This is unexported because the recommended path for callers is RetryPostgresTx +// (safe, transactional) or RetryUnsafe (idempotent caller), not building a +// custom retry loop on top of this classifier. +func isSafeRetryablePostgresError(err error) bool { if err == nil { return false } @@ -212,7 +215,7 @@ func IsRetryablePostgresError(err error) bool { // Note: 08xxx (connection_exception class) codes are intentionally omitted. // pgx surfaces connection-level failures as network errors, not as // *pgconn.PgError, so those cases are handled by isPostgresNetworkError - // (used by isRetryablePostgresTxError within a transaction). + // (used by isRetryablePostgresPreCommitError within a transaction). var pgErr *pgconn.PgError if errors.As(err, &pgErr) { switch pgErr.Code { @@ -233,8 +236,8 @@ func IsRetryablePostgresError(err error) bool { // isPostgresNetworkError returns true for typed network-level errors that // indicate the TCP connection was severed. These are safe to retry ONLY within // an explicit transaction (the server rolls back the uncommitted transaction), -// so this helper is used by isRetryablePostgresTxError and NOT by -// IsRetryablePostgresError. +// so this helper is used by isRetryablePostgresPreCommitError and NOT by +// isSafeRetryablePostgresError. // // String-matching on error messages is intentionally avoided: pgx flags // pre-send failures via pgconn.SafeToRetry (which the stdlib adapter converts @@ -259,35 +262,36 @@ func isPostgresNetworkError(err error) bool { // hatch for services with tighter latency budgets. const retryJitterMax = 100 * time.Millisecond -// isRetryablePostgresTxError is the default classifier used by RetryPostgresTx. -// It is the OR of four signals: +// isRetryablePostgresPreCommitError is the default classifier used by +// RetryPostgresTx for errors that occur BEFORE Commit is called (i.e., from +// BeginTx or fn). It is deliberately BROAD: any error from Begin or fn is safe +// to retry because the transaction rolls back, so transient network errors and +// server-rolled-back SQLSTATE codes are all included. It is the OR of four +// signals: // // 1. errors.Is(err, driver.ErrBadConn) — covers fn's tx.Exec/tx.Query // pre-send failures. pgx's stdlib adapter converts pgconn.SafeToRetry- // flagged errors from Exec/Query into driver.ErrBadConn before // database/sql (and thus fn) sees them, discarding the original pgx // error. This is the primary retry trigger inside a transaction. -// 2. pgconn.SafeToRetry(err) — covers Begin/Commit failures where the -// original pgx error flows through unchanged (connect errors, conn-busy, -// HA NotPreferredError, pre-send timeouts). -// 3. IsRetryablePostgresError(err) — covers server-returned SQLSTATE codes -// that guarantee rollback (40001, 40P01, 57P01, 57P02, 57P03, 53300, -// 57014). These are NOT SafeToRetry-flagged by pgx and flow through -// unchanged. -// 4. isPostgresNetworkError(err) — covers typed network errors (net.OpError, -// io.EOF, io.ErrUnexpectedEOF) where the TCP connection was severed. -// These are safe to retry within an explicit transaction because the -// server rolls back the uncommitted transaction. This leg is intentionally -// NOT part of IsRetryablePostgresError (which is for use without a -// transaction, where network errors are unsafe because you can't know if -// the query landed). The residual timing issue — commit succeeded but the -// response was lost — is accepted here and will be addressed by the -// stacked pg_current_xact_id()/pg_xact_status() commit-verification -// follow-up. +// 2. pgconn.SafeToRetry(err) — pgx guarantees these errors ALWAYS occur +// before any data is sent to the server (e.g., during connection +// acquisition, conn-busy, HA NotPreferredError, pre-send timeouts), so +// they are safe to retry regardless of transaction phase. For Begin/Commit +// the original pgx error flows through the stdlib adapter unchanged. +// 3. isSafeRetryablePostgresError(err) — server-returned SQLSTATE codes that +// guarantee rollback (40001, 40P01, 57P01, 57P02, 57P03, 53300, 57014). +// These are NOT SafeToRetry-flagged by pgx and flow through unchanged. +// 4. isPostgresNetworkError(err) — typed network errors (net.OpError, io.EOF, +// io.ErrUnexpectedEOF) where the TCP connection was severed. Safe to retry +// pre-commit because the server rolls back the uncommitted transaction. +// This leg is intentionally NOT part of isSafeRetryablePostgresError +// (which is for use without a transaction, where network errors are unsafe +// because you can't know if the query landed). // -// driver.ErrBadConn is backend-agnostic, so the same "retry on bad conn" -// leg will carry over to RetryMySQLTx / RetrySpannerTx when implemented. -func isRetryablePostgresTxError(err error) bool { +// driver.ErrBadConn is backend-agnostic, so leg 1 will carry over to +// RetryMySQLTx / RetrySpannerTx when implemented. +func isRetryablePostgresPreCommitError(err error) bool { if errors.Is(err, driver.ErrBadConn) { return true } @@ -297,18 +301,72 @@ func isRetryablePostgresTxError(err error) bool { if isPostgresNetworkError(err) { return true } - return IsRetryablePostgresError(err) + return isSafeRetryablePostgresError(err) +} + +// isRetryablePostgresCommitError is the default classifier used by +// RetryPostgresTx for errors that occur DURING the commit phase (i.e., from +// (*sql.Tx).Commit). It is deliberately NARROWER than +// isRetryablePostgresPreCommitError because a commit-phase error is ambiguous: +// the commit may have succeeded on the server before the error was returned to +// the client (e.g., the TCP connection was severed after the COMMIT message +// was sent but before the response was received). Retrying such an error could +// duplicate the committed work. +// +// Only errors that GUARANTEE the commit did not happen are retryable here: +// - errors.Is(err, driver.ErrBadConn): pgx's stdlib adapter only produces +// this for pgconn.SafeToRetry-flagged (pre-send) errors, so it implies the +// COMMIT message was never sent. +// - pgconn.SafeToRetry(err): pgx guarantees the error occurred before the +// COMMIT message was sent (pre-send). +// - isSafeRetryablePostgresError(err): server-returned SQLSTATE codes that +// guarantee rollback (e.g., 40001 serialization_failure at commit time) — +// the server rejected the commit and rolled back. +// +// Network errors (net.OpError, io.EOF) are intentionally NOT retryable here: +// they could mean the commit succeeded but the response was lost. Such errors +// are returned to the caller wrapped with ErrCommitPhase so the caller can +// decide whether to alert, reconcile, or check via pg_xact_status() (future). +func isRetryablePostgresCommitError(err error) bool { + if errors.Is(err, driver.ErrBadConn) { + return true + } + if pgconn.SafeToRetry(err) { + return true + } + return isSafeRetryablePostgresError(err) } // RetryPostgresTx executes fn inside a Postgres transaction, retrying the // entire transaction on transient errors. fn receives a fresh *sql.Tx each // attempt; if fn returns an error the tx is rolled back, if nil the tx is -// committed. An error is retried when isRetryablePostgresTxError(err) OR -// opts.IsRetryable(err) is true. +// committed. +// +// Pre-commit errors (from BeginTx or fn) are retried when +// isRetryablePostgresPreCommitError(err) OR opts.IsRetryable(err) is true — a +// broad set, because the transaction rolls back so retrying is safe. +// +// Commit-phase errors (from (*sql.Tx).Commit) are retried only when +// isRetryablePostgresCommitError(err) OR opts.IsRetryable(err) is true — a +// narrower set, because a commit-phase error may mean the commit already +// succeeded (e.g., the connection was severed after the COMMIT was sent but +// before the response was received). Commit-phase errors that are NOT retried +// are returned to the caller wrapped with ErrCommitPhase so the caller can +// decide whether to alert, reconcile, or check via pg_xact_status() (future). // // db MUST be a *sql.DB backed by the pgx stdlib adapter (e.g. one returned // by database.New with a PostgresConfig). Using a *sql.DB backed by another // driver will not produce pgx-specific retries. func RetryPostgresTx(ctx context.Context, db *sql.DB, opts RetryTxOptions, fn func(*sql.Tx) error) error { - return retryTx(ctx, db, isRetryablePostgresTxError, opts, fn) + return retryTx(ctx, db, postgresTxClassifier, opts, fn) +} + +// postgresTxClassifier is the per-phase retry classifier used by RetryPostgresTx. +// The preCommit classifier is broad (any transient error is safe because the +// transaction rolls back); the commit classifier is narrow (only errors that +// guarantee the commit did not happen), because a commit-phase error may mean +// the commit already succeeded. See ErrCommitPhase. +var postgresTxClassifier = retryClassifier{ + preCommit: isRetryablePostgresPreCommitError, + commit: isRetryablePostgresCommitError, } diff --git a/database/postgres_test.go b/database/postgres_test.go index 6b000d94..776fcddf 100644 --- a/database/postgres_test.go +++ b/database/postgres_test.go @@ -2,9 +2,7 @@ package database_test import ( "context" - "errors" "io" - "net" "os" "path/filepath" "testing" @@ -184,61 +182,6 @@ func Test_Postgres_Alloy_Migrations(t *testing.T) { defer db.Close() } -func TestIsRetryablePostgresError(t *testing.T) { - // nil error is not retryable - require.False(t, database.IsRetryablePostgresError(nil)) - - // admin_shutdown is retryable (seen during AlloyDB maintenance) - require.True(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "57P01"})) - - // crash_shutdown is retryable - require.True(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "57P02"})) - - // cannot_connect_now is retryable - require.True(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "57P03"})) - - // serialization_failure and deadlock_detected are retryable (server rolled back) - require.True(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "40001"})) - require.True(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "40P01"})) - - // too_many_connections is retryable (rejected at connect time) - require.True(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "53300"})) - - // query_canceled is retryable (server rolled back the transaction) - require.True(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "57014"})) - - // unique_violation is NOT retryable (application-level error) - require.False(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "23505"})) - - // syntax_error is NOT retryable - require.False(t, database.IsRetryablePostgresError(&pgconn.PgError{Code: "42601"})) - - // Network errors (EOF, net.OpError) are NOT retryable here: without an - // explicit transaction there is no way to know whether the error occurred - // before or after the server accepted the query, so retrying could - // duplicate committed work. Use RetryPostgresTx (whose classifier adds - // network errors back in) for transactional retry. - require.False(t, database.IsRetryablePostgresError(io.EOF)) - require.False(t, database.IsRetryablePostgresError(io.ErrUnexpectedEOF)) - require.False(t, database.IsRetryablePostgresError(&net.OpError{ - Op: "read", - Err: errors.New("connection reset by peer"), - })) - - // String-matched connection errors are NOT retryable: string matching is - // fragile, and the typed checks above (plus driver.ErrBadConn and - // pgconn.SafeToRetry in the transactional classifier) cover the real cases. - require.False(t, database.IsRetryablePostgresError(errors.New("connection reset by peer"))) - require.False(t, database.IsRetryablePostgresError(errors.New("broken pipe"))) - require.False(t, database.IsRetryablePostgresError(errors.New("conn closed"))) - - // context.DeadlineExceeded is NOT retryable - require.False(t, database.IsRetryablePostgresError(context.DeadlineExceeded)) - - // Random application error is NOT retryable - require.False(t, database.IsRetryablePostgresError(errors.New("invalid input"))) -} - func TestRetryUnsafe(t *testing.T) { t.Run("succeeds on first attempt", func(t *testing.T) { calls := 0 diff --git a/database/retry.go b/database/retry.go index 3625f6c5..08ce5282 100644 --- a/database/retry.go +++ b/database/retry.go @@ -4,6 +4,7 @@ import ( "context" "database/sql" "errors" + "fmt" "math/rand" "time" ) @@ -16,8 +17,10 @@ type RetryTxOptions struct { // TxOptions is passed to (*sql.DB).BeginTx on each attempt. nil = default isolation. TxOptions *sql.TxOptions // IsRetryable, if non-nil, is consulted IN ADDITION to the backend's default - // classifier (e.g. isRetryablePostgresTxError). Lets consumers add + // classifier (both the pre-commit and commit phases). Lets consumers add // implementation-specific cases on top of the library's known-safe floor. + // Callers should be careful adding cases that are unsafe at commit time — a + // commit-phase error may mean the commit already succeeded (see ErrCommitPhase). IsRetryable func(err error) bool } @@ -30,12 +33,48 @@ type RetryUnsafeOptions struct { IsRetryable func(err error) bool } +// ErrCommitPhase wraps an error that occurred during the commit phase of a +// retried transaction (i.e., from (*sql.Tx).Commit) and was NOT retried. A +// commit-phase error is ambiguous: the commit may have succeeded on the server +// before the error was returned to the client (e.g., the TCP connection was +// severed after the COMMIT message was sent but before the response was +// received). Retrying such an operation could duplicate the committed work, so +// RetryPostgresTx deliberately does NOT retry commit-phase errors whose type +// could indicate a successful commit (e.g., network errors). +// +// The error is returned wrapped with ErrCommitPhase so the caller can detect +// it via errors.Is(err, database.ErrCommitPhase) and decide whether to alert, +// reconcile, or check whether the commit landed (e.g., via pg_xact_status(), +// which is future work). The underlying error is preserved for inspection via +// errors.Is/errors.As. +var ErrCommitPhase = errors.New("database: error occurred during the commit phase; the transaction may have committed before the error was returned") + // beginTxer is satisfied by *sql.DB so the retry loop is testable with fakes // without needing a real database driver. type beginTxer interface { BeginTx(ctx context.Context, opts *sql.TxOptions) (*sql.Tx, error) } +// retryClassifier holds the per-phase retryability classifiers used by retryTx. +// preCommit classifies errors from BeginTx or fn (before Commit is called); +// commit classifies errors from (*sql.Tx).Commit. The commit classifier should +// be NARROWER than preCommit because a commit-phase error may mean the commit +// already succeeded (see ErrCommitPhase). +type retryClassifier struct { + preCommit func(error) bool + commit func(error) bool +} + +// classify returns whether err is retryable, using the commit classifier when +// the error occurred during the commit phase and the preCommit classifier +// otherwise. +func (c retryClassifier) classify(err error, commitPhase bool) bool { + if commitPhase { + return c.commit(err) + } + return c.preCommit(err) +} + // RetryUnsafe executes fn up to MaxAttempts times, retrying on any error that // opts.IsRetryable classifies as retryable (default: any error except context // cancellation/deadline). fn MUST be idempotent — no transaction wrapper is @@ -75,44 +114,58 @@ func isRetryableUnsafeDefault(err error) bool { } // retryTx is the shared transactional retry loop used by RetryPostgresTx and -// the future RetryMySQLTx / RetrySpannerTx implementations. defaultClassifier -// is the backend's known-safe floor; opts.IsRetryable is additive on top of it. -func retryTx(ctx context.Context, db beginTxer, defaultClassifier func(error) bool, opts RetryTxOptions, fn func(*sql.Tx) error) error { +// the future RetryMySQLTx / RetrySpannerTx implementations. base is the +// backend's per-phase known-safe floor; opts.IsRetryable is additive on top of +// BOTH phases. Commit-phase errors that are not retried are wrapped with +// ErrCommitPhase before being returned to the caller, so the caller can detect +// that the error occurred during the commit phase (where the commit may have +// succeeded before the error was returned). +func retryTx(ctx context.Context, db beginTxer, base retryClassifier, opts RetryTxOptions, fn func(*sql.Tx) error) error { maxAttempts := opts.MaxAttempts if maxAttempts <= 0 { maxAttempts = 3 } - isRetryable := defaultClassifier + classifier := base if opts.IsRetryable != nil { extra := opts.IsRetryable - isRetryable = func(err error) bool { - return defaultClassifier(err) || extra(err) + classifier = retryClassifier{ + preCommit: func(err error) bool { return base.preCommit(err) || extra(err) }, + commit: func(err error) bool { return base.commit(err) || extra(err) }, } } var lastErr error + var lastCommitPhase bool for attempt := 0; attempt < maxAttempts; attempt++ { - lastErr = runOneTxAttempt(ctx, db, opts.TxOptions, fn) + var commitPhase bool + lastErr, commitPhase = runOneTxAttempt(ctx, db, opts.TxOptions, fn) + lastCommitPhase = commitPhase if lastErr == nil { return nil } - if !isRetryable(lastErr) { - return lastErr + if !classifier.classify(lastErr, commitPhase) { + break } if !sleepWithJitter(ctx, attempt, maxAttempts) { return ctx.Err() } } + if lastCommitPhase { + return fmt.Errorf("%w: %w", ErrCommitPhase, lastErr) + } return lastErr } // runOneTxAttempt runs a single begin/fn/commit attempt and returns the first -// error encountered (from BeginTx, fn, or Commit). On a successful commit it -// returns nil. The deferred Rollback is a no-op after a successful Commit and -// cleans up the transaction if fn returns an error or panics. -func runOneTxAttempt(ctx context.Context, db beginTxer, txOpts *sql.TxOptions, fn func(*sql.Tx) error) (err error) { +// error encountered (from BeginTx, fn, or Commit) along with a commitPhase +// flag indicating whether the error came from (*sql.Tx).Commit. On a +// successful commit it returns (nil, false). The deferred Rollback is a no-op +// after a successful Commit and cleans up the transaction if fn returns an +// error or panics; after a failed Commit the transaction is already in a +// terminal state so Rollback returns sql.ErrTxDone (ignored). +func runOneTxAttempt(ctx context.Context, db beginTxer, txOpts *sql.TxOptions, fn func(*sql.Tx) error) (err error, commitPhase bool) { tx, beginErr := db.BeginTx(ctx, txOpts) if beginErr != nil { - return beginErr + return beginErr, false } defer func() { if p := recover(); p != nil { @@ -124,12 +177,15 @@ func runOneTxAttempt(ctx context.Context, db beginTxer, txOpts *sql.TxOptions, f } }() if err = fn(tx); err != nil { - return err + return err, false + } + // TODO(future): capture pg_current_xact_id() before commit and consult + // pg_xact_status() on commit error to detect whether the commit landed. + // Postgres-only; see RetryPostgresTx and ErrCommitPhase. + if err = tx.Commit(); err != nil { + return err, true } - // TODO(stacked-pr): capture pg_current_xact_id() before commit and - // consult pg_xact_status() on commit error to detect whether the commit - // landed. Postgres-only; see RetryPostgresTx. - return tx.Commit() + return nil, false } // sleepWithJitter waits for a random duration in [0, retryJitterMax) before diff --git a/database/retry_test.go b/database/retry_test.go index b6d06e77..cc80904a 100644 --- a/database/retry_test.go +++ b/database/retry_test.go @@ -231,6 +231,60 @@ func TestRetryPostgresTx(t *testing.T) { require.Equal(t, 2, d.commitCalls) // first commit failed, second succeeded }) + t.Run("commit-phase network error is NOT retried and is wrapped with ErrCommitPhase", func(t *testing.T) { + // A network error during commit could mean the commit succeeded but the + // response was lost, so the narrow commit classifier does NOT retry it. + // It is returned to the caller wrapped with ErrCommitPhase so the caller + // can detect the ambiguous commit-phase failure. + d := &mockDriver{ + commitErrs: []error{io.EOF}, + } + db := newMockDB(d) + defer db.Close() + + calls := 0 + err := RetryPostgresTx(context.Background(), db, RetryTxOptions{MaxAttempts: 3}, func(*sql.Tx) error { + calls++ + return nil + }) + require.Error(t, err) + require.Equal(t, 1, calls) // not retried + require.Equal(t, 1, d.beginCalls) + require.Equal(t, 1, d.commitCalls) // commit was attempted (and failed) + require.Equal(t, 0, d.rollbackCalls) // commit failed; rollback is a no-op (ErrTxDone) + require.ErrorIs(t, err, ErrCommitPhase) + require.ErrorIs(t, err, io.EOF) // underlying error preserved + }) + + t.Run("commit-phase exhaustion wraps the final error with ErrCommitPhase", func(t *testing.T) { + // A retryable commit error (40001 at commit) that never succeeds: the + // final error is a commit-phase error, so it is wrapped with + // ErrCommitPhase even though we exhausted attempts retrying it. + d := &mockDriver{ + commitErrs: []error{ + &pgconn.PgError{Code: "40001"}, // serialization_failure at commit (retryable) + &pgconn.PgError{Code: "40001"}, + &pgconn.PgError{Code: "40001"}, + }, + } + db := newMockDB(d) + defer db.Close() + + calls := 0 + err := RetryPostgresTx(context.Background(), db, RetryTxOptions{MaxAttempts: 3}, func(*sql.Tx) error { + calls++ + return nil + }) + require.Error(t, err) + require.Equal(t, 3, calls) + require.Equal(t, 3, d.commitCalls) + require.ErrorIs(t, err, ErrCommitPhase) + // The underlying *pgconn.PgError is preserved for inspection via errors.As. + var pgErr *pgconn.PgError + require.ErrorAs(t, err, &pgErr) + require.Equal(t, "40001", pgErr.Code) + }) + t.Run("additive opts.IsRetryable extends the default classifier", func(t *testing.T) { // sentinelErr is not driver.ErrBadConn, not SafeToRetry-flagged, and not // an IsRetryablePostgresError SQLSTATE code, so the default classifier @@ -296,37 +350,88 @@ func TestRetryPostgresTx(t *testing.T) { }) } -func TestIsRetryablePostgresTxError(t *testing.T) { +func TestIsSafeRetryablePostgresError(t *testing.T) { + // SQLSTATE codes that the server guarantees were rolled back. + require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "57P01"})) // admin_shutdown + require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "57P02"})) // crash_shutdown + require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "57P03"})) // cannot_connect_now + require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "40001"})) // serialization_failure + require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "40P01"})) // deadlock_detected + require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "53300"})) // too_many_connections + require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "57014"})) // query_canceled + + // Application errors are not retryable. + require.False(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "23505"})) // unique_violation + require.False(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "42601"})) // syntax_error + + // Network errors are NOT retryable here (unsafe without a transaction). + require.False(t, isSafeRetryablePostgresError(nil)) + require.False(t, isSafeRetryablePostgresError(io.EOF)) + require.False(t, isSafeRetryablePostgresError(io.ErrUnexpectedEOF)) + require.False(t, isSafeRetryablePostgresError(&net.OpError{Op: "read", Err: errors.New("connection reset by peer")})) + require.False(t, isSafeRetryablePostgresError(context.Canceled)) + require.False(t, isSafeRetryablePostgresError(errors.New("some application error"))) +} + +func TestIsRetryablePostgresPreCommitError(t *testing.T) { // Leg 1: driver.ErrBadConn (fn-path signal — pgx stdlib adapter converts // SafeToRetry Exec/Query errors to this). - require.True(t, isRetryablePostgresTxError(driver.ErrBadConn)) - require.True(t, isRetryablePostgresTxError(fmt.Errorf("wrapped: %w", driver.ErrBadConn))) - - // Leg 2: pgconn.SafeToRetry-flagged (Begin/Commit-path signal). - require.True(t, isRetryablePostgresTxError(&safeToRetryErr{msg: "pre-send"})) - - // Leg 3: IsRetryablePostgresError SQLSTATE codes that guarantee rollback. - require.True(t, isRetryablePostgresTxError(&pgconn.PgError{Code: "40001"})) // serialization_failure - require.True(t, isRetryablePostgresTxError(&pgconn.PgError{Code: "40P01"})) // deadlock_detected - require.True(t, isRetryablePostgresTxError(&pgconn.PgError{Code: "57P01"})) // admin_shutdown - - // Non-retryable: unique_violation is an application error, not a transient one. - require.False(t, isRetryablePostgresTxError(&pgconn.PgError{Code: "23505"})) - // Non-retryable: a random error that matches none of the three legs. - require.False(t, isRetryablePostgresTxError(errors.New("some application error"))) - // Non-retryable: context cancellation is never retryable. - require.False(t, isRetryablePostgresTxError(context.Canceled)) - - // Leg 4: isPostgresNetworkError (typed network errors — safe within a - // transaction because the server rolls back the uncommitted tx). These are - // intentionally NOT in IsRetryablePostgresError (unsafe without a tx), but - // IS safe here. - require.True(t, isRetryablePostgresTxError(io.EOF)) - require.True(t, isRetryablePostgresTxError(io.ErrUnexpectedEOF)) - require.True(t, isRetryablePostgresTxError(&net.OpError{ + require.True(t, isRetryablePostgresPreCommitError(driver.ErrBadConn)) + require.True(t, isRetryablePostgresPreCommitError(fmt.Errorf("wrapped: %w", driver.ErrBadConn))) + + // Leg 2: pgconn.SafeToRetry-flagged — pgx guarantees these ALWAYS occur + // before any data is sent to the server. + require.True(t, isRetryablePostgresPreCommitError(&safeToRetryErr{msg: "pre-send"})) + + // Leg 3: isSafeRetryablePostgresError SQLSTATE codes that guarantee rollback. + require.True(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "40001"})) // serialization_failure + require.True(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "40P01"})) // deadlock_detected + require.True(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "57P01"})) // admin_shutdown + + // Leg 4: isPostgresNetworkError (typed network errors — safe pre-commit + // because the server rolls back the uncommitted tx). NOT in + // isSafeRetryablePostgresError (unsafe without a tx). + require.True(t, isRetryablePostgresPreCommitError(io.EOF)) + require.True(t, isRetryablePostgresPreCommitError(io.ErrUnexpectedEOF)) + require.True(t, isRetryablePostgresPreCommitError(&net.OpError{ + Op: "read", + Err: errors.New("connection reset by peer"), + })) + + // Non-retryable: application errors and context cancellation. + require.False(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "23505"})) // unique_violation + require.False(t, isRetryablePostgresPreCommitError(errors.New("some application error"))) + require.False(t, isRetryablePostgresPreCommitError(context.Canceled)) +} + +func TestIsRetryablePostgresCommitError(t *testing.T) { + // The commit classifier is NARROWER than the pre-commit classifier: a + // commit-phase error may mean the commit already succeeded, so only + // errors that GUARANTEE the commit did not happen are retryable here. + + // Retryable at commit: pre-send guarantees. + require.True(t, isRetryablePostgresCommitError(driver.ErrBadConn)) + require.True(t, isRetryablePostgresCommitError(&safeToRetryErr{msg: "pre-send commit"})) + + // Retryable at commit: server-rolled-back SQLSTATE codes (e.g. 40001 + // serialization_failure raised at commit time — server rolled back). + require.True(t, isRetryablePostgresCommitError(&pgconn.PgError{Code: "40001"})) + require.True(t, isRetryablePostgresCommitError(&pgconn.PgError{Code: "40P01"})) + + // NOT retryable at commit: network errors could mean the commit succeeded + // but the response was lost. These are returned to the caller wrapped with + // ErrCommitPhase (see TestRetryPostgresTx/commit-phase_network_error). + require.False(t, isRetryablePostgresCommitError(io.EOF)) + require.False(t, isRetryablePostgresCommitError(io.ErrUnexpectedEOF)) + require.False(t, isRetryablePostgresCommitError(&net.OpError{ Op: "read", Err: errors.New("connection reset by peer"), })) + + // NOT retryable at commit: application errors. + require.False(t, isRetryablePostgresCommitError(&pgconn.PgError{Code: "23505"})) // unique_violation + require.False(t, isRetryablePostgresCommitError(errors.New("some application error"))) + require.False(t, isRetryablePostgresCommitError(context.Canceled)) } func TestRetryMySQLTxNotImplemented(t *testing.T) { @@ -340,3 +445,17 @@ func TestRetrySpannerTxNotImplemented(t *testing.T) { require.Error(t, err) require.Contains(t, err.Error(), "not yet implemented") } + +func TestErrCommitPhase(t *testing.T) { + // ErrCommitPhase is a sentinel that wraps commit-phase errors so callers + // can detect them via errors.Is. The underlying error is preserved. + underlying := errors.New("connection severed during commit") + wrapped := fmt.Errorf("%w: %w", ErrCommitPhase, underlying) + + require.ErrorIs(t, wrapped, ErrCommitPhase, "caller can detect commit-phase failures") + require.ErrorIs(t, wrapped, underlying, "underlying error is preserved for inspection") + + // A bare (non-commit-phase) error is NOT ErrCommitPhase. + require.NotErrorIs(t, underlying, ErrCommitPhase) + require.NotErrorIs(t, io.EOF, ErrCommitPhase) +} From 0d15c834c75b6b9625d80de123af169faca5c11e Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Mon, 29 Jun 2026 19:34:51 +0000 Subject: [PATCH 04/13] database: merge commit classifier into isSafeRetryablePostgresError isRetryablePostgresCommitError was a strict subset of isRetryablePostgresPreCommitError (pre-send + server-rolled-back, without the pre-commit-only network leg), duplicating the driver.ErrBadConn and pgconn.SafeToRetry checks that also belonged to the 'safe at any phase' bucket. Folded those two legs into isSafeRetryablePostgresError so it becomes the single 'safe to retry regardless of transaction phase' classifier (pre-send + server-rolled-back SQLSTATE codes), and assigned it directly to postgresTxClassifier.commit. isRetryablePostgresPreCommitError now delegates: isSafeRetryablePostgresError || isPostgresNetworkError. Behavior is unchanged; only the code structure is simpler. Also documented on isRetryablePostgresPreCommitError the permanent-error categories that are intentionally NOT retried pre-commit even though retrying them would be safe (the transaction rolls back): constraint violations (23xxx), syntax errors (42601), schema errors (42xxx), permission errors (42501), and data exceptions (22xxx). These surface at query time (not connection time) and would fail again on retry, so retrying only adds ~200ms of latency before the caller receives the error. Callers who want to retry a broader set can pass opts.IsRetryable. Tests: folded TestIsRetryablePostgresCommitError assertions into TestIsSafeRetryablePostgresError (added driver.ErrBadConn + SafeToRetry -> true); removed TestIsRetryablePostgresCommitError; TestIsRetryablePostgresPreCommitError unchanged. Co-authored-by: E-Love (Eric Loveland) --- database/postgres.go | 173 +++++++++++++++++++---------------------- database/retry_test.go | 48 ++++-------- 2 files changed, 95 insertions(+), 126 deletions(-) diff --git a/database/postgres.go b/database/postgres.go index b898e315..be91d4dc 100644 --- a/database/postgres.go +++ b/database/postgres.go @@ -176,17 +176,32 @@ func PostgresDeadlockFound(err error) bool { return strings.Contains(err.Error(), postgresErrDeadlockFound) } -// isSafeRetryablePostgresError returns true if the error is a PostgreSQL -// SQLSTATE code that the server guarantees was rolled back before the error -// was returned. These are safe to retry even without an explicit transaction -// because the server has already undone any partial work. +// isSafeRetryablePostgresError returns true if the error is safe to retry +// regardless of transaction phase — i.e., the error guarantees no side effects +// occurred (the operation never reached the server, or the server rolled back +// before returning the error). It is the OR of: // -// Network-level errors (connection reset, broken pipe, EOF, etc.) are -// intentionally NOT classified as retryable here: without an explicit -// transaction there is no way to know whether the error occurred before or -// after the server accepted the query, so retrying could duplicate committed -// work. isRetryablePostgresPreCommitError adds network errors back in for use -// within RetryPostgresTx, where the transaction makes them safe to retry. +// - errors.Is(err, driver.ErrBadConn): pgx's stdlib adapter only produces +// this for pgconn.SafeToRetry-flagged (pre-send) errors, so the operation +// never reached the server. +// - pgconn.SafeToRetry(err): pgx guarantees the error occurred before any +// data was sent to the server (e.g., during connection acquisition, +// conn-busy, HA NotPreferredError, pre-send timeouts). +// - server-returned SQLSTATE codes that guarantee rollback (40001, 40P01, +// 57P01, 57P02, 57P03, 53300, 57014) — the server rolled back the +// transaction before returning the error. +// +// It is used directly as the commit-phase classifier for RetryPostgresTx +// (where network errors are NOT safe because the commit may have succeeded — +// see ErrCommitPhase) and as the foundation of the pre-commit classifier +// isRetryablePostgresPreCommitError (which adds network errors via +// isPostgresNetworkError, since pre-commit the transaction rolls back). +// +// Network-level errors (net.OpError, io.EOF) are intentionally NOT classified +// here: without an explicit transaction there is no way to know whether the +// error occurred before or after the server accepted the query, and at commit +// time a network error could mean the commit succeeded but the response was +// lost. // // This is unexported because the recommended path for callers is RetryPostgresTx // (safe, transactional) or RetryUnsafe (idempotent caller), not building a @@ -195,7 +210,18 @@ func isSafeRetryablePostgresError(err error) bool { if err == nil { return false } - + // driver.ErrBadConn is produced by pgx's stdlib adapter only for + // pgconn.SafeToRetry-flagged (pre-send) errors from Exec/Query, so the + // operation never reached the server. Safe to retry at any phase. + if errors.Is(err, driver.ErrBadConn) { + return true + } + // pgconn.SafeToRetry is pgx's authoritative flag for "occurred before any + // data was sent to the server". Covers Begin/Commit failures where the + // original pgx error flows through the stdlib adapter unchanged. + if pgconn.SafeToRetry(err) { + return true + } // 57P01 admin_shutdown, 57P02 crash_shutdown: fast shutdown rolls back all // active transactions before disconnecting clients. See: // https://www.postgresql.org/docs/current/server-shutdown.html @@ -227,9 +253,6 @@ func isSafeRetryablePostgresError(err error) bool { } return false } - - // Non-PgError errors (network errors, context errors, application errors) - // are not retryable here — see the doc comment above. return false } @@ -264,77 +287,34 @@ const retryJitterMax = 100 * time.Millisecond // isRetryablePostgresPreCommitError is the default classifier used by // RetryPostgresTx for errors that occur BEFORE Commit is called (i.e., from -// BeginTx or fn). It is deliberately BROAD: any error from Begin or fn is safe -// to retry because the transaction rolls back, so transient network errors and -// server-rolled-back SQLSTATE codes are all included. It is the OR of four -// signals: +// BeginTx or fn). It is isSafeRetryablePostgresError OR isPostgresNetworkError: +// pre-commit, typed network errors are also safe to retry because the server +// rolls back the uncommitted transaction. // -// 1. errors.Is(err, driver.ErrBadConn) — covers fn's tx.Exec/tx.Query -// pre-send failures. pgx's stdlib adapter converts pgconn.SafeToRetry- -// flagged errors from Exec/Query into driver.ErrBadConn before -// database/sql (and thus fn) sees them, discarding the original pgx -// error. This is the primary retry trigger inside a transaction. -// 2. pgconn.SafeToRetry(err) — pgx guarantees these errors ALWAYS occur -// before any data is sent to the server (e.g., during connection -// acquisition, conn-busy, HA NotPreferredError, pre-send timeouts), so -// they are safe to retry regardless of transaction phase. For Begin/Commit -// the original pgx error flows through the stdlib adapter unchanged. -// 3. isSafeRetryablePostgresError(err) — server-returned SQLSTATE codes that -// guarantee rollback (40001, 40P01, 57P01, 57P02, 57P03, 53300, 57014). -// These are NOT SafeToRetry-flagged by pgx and flow through unchanged. -// 4. isPostgresNetworkError(err) — typed network errors (net.OpError, io.EOF, -// io.ErrUnexpectedEOF) where the TCP connection was severed. Safe to retry -// pre-commit because the server rolls back the uncommitted transaction. -// This leg is intentionally NOT part of isSafeRetryablePostgresError -// (which is for use without a transaction, where network errors are unsafe -// because you can't know if the query landed). +// Permanent application errors are intentionally NOT retried here, even though +// retrying them would be safe (the transaction rolls back): they would fail +// again on the next attempt, so retrying only adds ~200ms of latency (3 +// attempts with full-jitter backoff up to retryJitterMax) before the caller +// receives the error. The categories we deliberately do NOT retry are: // -// driver.ErrBadConn is backend-agnostic, so leg 1 will carry over to -// RetryMySQLTx / RetrySpannerTx when implemented. -func isRetryablePostgresPreCommitError(err error) bool { - if errors.Is(err, driver.ErrBadConn) { - return true - } - if pgconn.SafeToRetry(err) { - return true - } - if isPostgresNetworkError(err) { - return true - } - return isSafeRetryablePostgresError(err) -} - -// isRetryablePostgresCommitError is the default classifier used by -// RetryPostgresTx for errors that occur DURING the commit phase (i.e., from -// (*sql.Tx).Commit). It is deliberately NARROWER than -// isRetryablePostgresPreCommitError because a commit-phase error is ambiguous: -// the commit may have succeeded on the server before the error was returned to -// the client (e.g., the TCP connection was severed after the COMMIT message -// was sent but before the response was received). Retrying such an error could -// duplicate the committed work. -// -// Only errors that GUARANTEE the commit did not happen are retryable here: -// - errors.Is(err, driver.ErrBadConn): pgx's stdlib adapter only produces -// this for pgconn.SafeToRetry-flagged (pre-send) errors, so it implies the -// COMMIT message was never sent. -// - pgconn.SafeToRetry(err): pgx guarantees the error occurred before the -// COMMIT message was sent (pre-send). -// - isSafeRetryablePostgresError(err): server-returned SQLSTATE codes that -// guarantee rollback (e.g., 40001 serialization_failure at commit time) — -// the server rejected the commit and rolled back. +// - Constraint violations (class 23xxx) — e.g. unique_violation 23505, +// foreign_key_violation 23503, check_violation 23514, not_null_violation +// 23502. The data conflict persists across retries. +// - Syntax errors (42601) — the SQL is malformed; indicates a code bug. +// - Schema errors (class 42xxx) — e.g. undefined_table 42P01, +// undefined_column 42703. Indicates a migration drift or code bug. +// - Permission errors (42501 insufficient_privilege) — permissions don't +// change mid-query. +// - Data exceptions (class 22xxx) — e.g. string_data_right_truncation 22001, +// datatype_mismatch 42804. The data doesn't fit the column/type. // -// Network errors (net.OpError, io.EOF) are intentionally NOT retryable here: -// they could mean the commit succeeded but the response was lost. Such errors -// are returned to the caller wrapped with ErrCommitPhase so the caller can -// decide whether to alert, reconcile, or check via pg_xact_status() (future). -func isRetryablePostgresCommitError(err error) bool { - if errors.Is(err, driver.ErrBadConn) { - return true - } - if pgconn.SafeToRetry(err) { - return true - } - return isSafeRetryablePostgresError(err) +// These are discovered at query time (not connection time, where +// incompatibility/handshake failures surface), so they reach fn and would +// recur on retry. Callers who want to retry a broader set (e.g. a custom +// "transaction rolled back" sentinel from a stored procedure) can pass +// opts.IsRetryable, which is additive on top of this classifier. +func isRetryablePostgresPreCommitError(err error) bool { + return isSafeRetryablePostgresError(err) || isPostgresNetworkError(err) } // RetryPostgresTx executes fn inside a Postgres transaction, retrying the @@ -344,15 +324,19 @@ func isRetryablePostgresCommitError(err error) bool { // // Pre-commit errors (from BeginTx or fn) are retried when // isRetryablePostgresPreCommitError(err) OR opts.IsRetryable(err) is true — a -// broad set, because the transaction rolls back so retrying is safe. +// broad set (pre-send + server-rolled-back + network), because the transaction +// rolls back so retrying is safe. Permanent application errors (constraint +// violations, syntax/schema/permission errors) are NOT retried; see +// isRetryablePostgresPreCommitError's doc for the full list. // // Commit-phase errors (from (*sql.Tx).Commit) are retried only when -// isRetryablePostgresCommitError(err) OR opts.IsRetryable(err) is true — a -// narrower set, because a commit-phase error may mean the commit already -// succeeded (e.g., the connection was severed after the COMMIT was sent but -// before the response was received). Commit-phase errors that are NOT retried -// are returned to the caller wrapped with ErrCommitPhase so the caller can -// decide whether to alert, reconcile, or check via pg_xact_status() (future). +// isSafeRetryablePostgresError(err) OR opts.IsRetryable(err) is true — a +// narrower set (pre-send + server-rolled-back; network errors excluded), +// because a commit-phase network error may mean the commit already succeeded +// (e.g., the connection was severed after the COMMIT was sent but before the +// response was received). Commit-phase errors that are NOT retried are +// returned to the caller wrapped with ErrCommitPhase so the caller can decide +// whether to alert, reconcile, or check via pg_xact_status() (future). // // db MUST be a *sql.DB backed by the pgx stdlib adapter (e.g. one returned // by database.New with a PostgresConfig). Using a *sql.DB backed by another @@ -362,11 +346,12 @@ func RetryPostgresTx(ctx context.Context, db *sql.DB, opts RetryTxOptions, fn fu } // postgresTxClassifier is the per-phase retry classifier used by RetryPostgresTx. -// The preCommit classifier is broad (any transient error is safe because the -// transaction rolls back); the commit classifier is narrow (only errors that -// guarantee the commit did not happen), because a commit-phase error may mean -// the commit already succeeded. See ErrCommitPhase. +// The preCommit classifier is broad (pre-send + server-rolled-back + network — +// any transient error is safe because the transaction rolls back); the commit +// classifier is the narrow isSafeRetryablePostgresError (pre-send + +// server-rolled-back; network excluded), because a commit-phase network error +// may mean the commit already succeeded. See ErrCommitPhase. var postgresTxClassifier = retryClassifier{ preCommit: isRetryablePostgresPreCommitError, - commit: isRetryablePostgresCommitError, + commit: isSafeRetryablePostgresError, } diff --git a/database/retry_test.go b/database/retry_test.go index cc80904a..06d67b3a 100644 --- a/database/retry_test.go +++ b/database/retry_test.go @@ -351,6 +351,18 @@ func TestRetryPostgresTx(t *testing.T) { } func TestIsSafeRetryablePostgresError(t *testing.T) { + // isSafeRetryablePostgresError is the "safe to retry at any phase" + // classifier — pre-send guarantees + server-rolled-back SQLSTATE codes. + // It is used directly as the commit-phase classifier for RetryPostgresTx + // and as the foundation of the pre-commit classifier. + + // Pre-send: driver.ErrBadConn (pgx stdlib adapter only produces this for + // SafeToRetry-flagged Exec/Query errors) and pgconn.SafeToRetry-flagged + // errors. Safe at any phase because the operation never reached the server. + require.True(t, isSafeRetryablePostgresError(driver.ErrBadConn)) + require.True(t, isSafeRetryablePostgresError(fmt.Errorf("wrapped: %w", driver.ErrBadConn))) + require.True(t, isSafeRetryablePostgresError(&safeToRetryErr{msg: "pre-send"})) + // SQLSTATE codes that the server guarantees were rolled back. require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "57P01"})) // admin_shutdown require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "57P02"})) // crash_shutdown @@ -360,11 +372,13 @@ func TestIsSafeRetryablePostgresError(t *testing.T) { require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "53300"})) // too_many_connections require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "57014"})) // query_canceled - // Application errors are not retryable. + // Permanent application errors are not retryable (would fail again). require.False(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "23505"})) // unique_violation require.False(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "42601"})) // syntax_error - // Network errors are NOT retryable here (unsafe without a transaction). + // Network errors are NOT retryable here: without a transaction there's no + // way to know if the query landed, and at commit time a network error + // could mean the commit succeeded but the response was lost (ErrCommitPhase). require.False(t, isSafeRetryablePostgresError(nil)) require.False(t, isSafeRetryablePostgresError(io.EOF)) require.False(t, isSafeRetryablePostgresError(io.ErrUnexpectedEOF)) @@ -404,36 +418,6 @@ func TestIsRetryablePostgresPreCommitError(t *testing.T) { require.False(t, isRetryablePostgresPreCommitError(context.Canceled)) } -func TestIsRetryablePostgresCommitError(t *testing.T) { - // The commit classifier is NARROWER than the pre-commit classifier: a - // commit-phase error may mean the commit already succeeded, so only - // errors that GUARANTEE the commit did not happen are retryable here. - - // Retryable at commit: pre-send guarantees. - require.True(t, isRetryablePostgresCommitError(driver.ErrBadConn)) - require.True(t, isRetryablePostgresCommitError(&safeToRetryErr{msg: "pre-send commit"})) - - // Retryable at commit: server-rolled-back SQLSTATE codes (e.g. 40001 - // serialization_failure raised at commit time — server rolled back). - require.True(t, isRetryablePostgresCommitError(&pgconn.PgError{Code: "40001"})) - require.True(t, isRetryablePostgresCommitError(&pgconn.PgError{Code: "40P01"})) - - // NOT retryable at commit: network errors could mean the commit succeeded - // but the response was lost. These are returned to the caller wrapped with - // ErrCommitPhase (see TestRetryPostgresTx/commit-phase_network_error). - require.False(t, isRetryablePostgresCommitError(io.EOF)) - require.False(t, isRetryablePostgresCommitError(io.ErrUnexpectedEOF)) - require.False(t, isRetryablePostgresCommitError(&net.OpError{ - Op: "read", - Err: errors.New("connection reset by peer"), - })) - - // NOT retryable at commit: application errors. - require.False(t, isRetryablePostgresCommitError(&pgconn.PgError{Code: "23505"})) // unique_violation - require.False(t, isRetryablePostgresCommitError(errors.New("some application error"))) - require.False(t, isRetryablePostgresCommitError(context.Canceled)) -} - func TestRetryMySQLTxNotImplemented(t *testing.T) { err := RetryMySQLTx(context.Background(), nil, RetryTxOptions{}, func(*sql.Tx) error { return nil }) require.Error(t, err) From 4d03a6ae53a2bf45c5bae03d58d40f101724ce88 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Mon, 29 Jun 2026 20:42:21 +0000 Subject: [PATCH 05/13] database: flip pre-commit classifier to opt-out and drop MaxAttempts config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related changes from review discussion: 1. Pre-commit classifier is now OPT-OUT. Pre-commit, retrying is always safe (the transaction rolls back), so the cost of a false positive (retrying a permanent error we didn't blocklist) is ~200ms of wasted latency, while the cost of a false negative (NOT retrying a transient error we didn't allowlist) is a spurious user-facing failure. The former is strictly safer, so isRetryablePostgresPreCommitError now retries everything EXCEPT context.Canceled/DeadlineExceeded and the permanent SQLSTATE classes (22xxx data exceptions, 23xxx constraint violations, 42xxx syntax/schema/permission), via a new isPermanentPostgresError helper. Unknown *pgconn.PgError codes and non-PgError errors are retried. The commit classifier (isSafeRetryablePostgresError) stays OPT-IN (narrow) because at commit retrying is not always safe. isPostgresNetworkError is removed (the pre-commit catch-all now covers network errors; the commit classifier never used it). 2. MaxAttempts is no longer configurable. Added a maxRetryAttempts=3 constant matching database/sql's maxBadConnRetries+1 convention, and removed the MaxAttempts field from RetryTxOptions and RetryUnsafeOptions (plus the '<= 0 -> default to 3' logic). Services that need a different retry budget should bound the total time via the context deadline or wrap the call in their own retry loop. Added a comment on retryTx explaining the interaction with database/sql's internal driver.ErrBadConn retry (immediate, up to 3 attempts for *sql.DB methods) — our outer retry layers on top with full-jitter backoff to survive longer outages; for *sql.Tx methods there is no internal retry, so this is the only one. Tests: TestIsRetryablePostgresPreCommitError rewritten for opt-out (application error and unknown PgError code now retryable; added 22xxx/ 42xxx permanent cases); removed isPostgresNetworkError comment refs; replaced all MaxAttempts: 3 with {} in RetryTxOptions/RetryUnsafeOptions. Co-authored-by: E-Love (Eric Loveland) --- database/postgres.go | 166 +++++++++++++++++++++++--------------- database/postgres_test.go | 12 +-- database/retry.go | 47 ++++++----- database/retry_test.go | 75 ++++++++++------- 4 files changed, 180 insertions(+), 120 deletions(-) diff --git a/database/postgres.go b/database/postgres.go index be91d4dc..7765005c 100644 --- a/database/postgres.go +++ b/database/postgres.go @@ -6,7 +6,6 @@ import ( "database/sql/driver" "errors" "fmt" - "io" "net" "strings" "time" @@ -191,17 +190,16 @@ func PostgresDeadlockFound(err error) bool { // 57P01, 57P02, 57P03, 53300, 57014) — the server rolled back the // transaction before returning the error. // -// It is used directly as the commit-phase classifier for RetryPostgresTx -// (where network errors are NOT safe because the commit may have succeeded — -// see ErrCommitPhase) and as the foundation of the pre-commit classifier -// isRetryablePostgresPreCommitError (which adds network errors via -// isPostgresNetworkError, since pre-commit the transaction rolls back). +// It is used directly as the commit-phase classifier for RetryPostgresTx. +// The pre-commit classifier (isRetryablePostgresPreCommitError) is a separate +// opt-out classifier that retries a broader set, because pre-commit the +// transaction rolls back so retrying is always safe. // // Network-level errors (net.OpError, io.EOF) are intentionally NOT classified // here: without an explicit transaction there is no way to know whether the // error occurred before or after the server accepted the query, and at commit // time a network error could mean the commit succeeded but the response was -// lost. +// lost (see ErrCommitPhase). // // This is unexported because the recommended path for callers is RetryPostgresTx // (safe, transactional) or RetryUnsafe (idempotent caller), not building a @@ -238,10 +236,12 @@ func isSafeRetryablePostgresError(err error) bool { // 57014 query_canceled: the server rolled back the transaction before returning // this error. // - // Note: 08xxx (connection_exception class) codes are intentionally omitted. - // pgx surfaces connection-level failures as network errors, not as - // *pgconn.PgError, so those cases are handled by isPostgresNetworkError - // (used by isRetryablePostgresPreCommitError within a transaction). + // Note: 08xxx (connection_exception class) codes are intentionally omitted + // here. pgx surfaces connection-level failures as network errors, not as + // *pgconn.PgError; the pre-commit classifier (isRetryablePostgresPreCommitError) + // retries those because pre-commit the transaction rolls back. At commit time + // they would be wrapped with ErrCommitPhase (conservative — no server response + // means the commit may have succeeded). var pgErr *pgconn.PgError if errors.As(err, &pgErr) { switch pgErr.Code { @@ -256,23 +256,29 @@ func isSafeRetryablePostgresError(err error) bool { return false } -// isPostgresNetworkError returns true for typed network-level errors that -// indicate the TCP connection was severed. These are safe to retry ONLY within -// an explicit transaction (the server rolls back the uncommitted transaction), -// so this helper is used by isRetryablePostgresPreCommitError and NOT by -// isSafeRetryablePostgresError. +// isPermanentPostgresError returns true for PostgreSQL SQLSTATE classes that +// indicate a permanent error which would fail again on retry. Used by the +// pre-commit classifier (isRetryablePostgresPreCommitError) to opt OUT of +// retrying these — pre-commit retrying them would be safe (the transaction +// rolls back) but futile, adding ~200ms of latency before the caller receives +// the error. The classes blocked here all surface at query time (not connection +// time) and don't resolve across retries: // -// String-matching on error messages is intentionally avoided: pgx flags -// pre-send failures via pgconn.SafeToRetry (which the stdlib adapter converts -// to driver.ErrBadConn), and post-send failures surface as typed net.OpError / -// io errors, so the typed checks are sufficient and far less fragile than -// substring matching. -func isPostgresNetworkError(err error) bool { - var netErr *net.OpError - if errors.As(err, &netErr) { - return true - } - return errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) +// - 22xxx data exceptions (e.g. string_data_right_truncation 22001) +// - 23xxx integrity constraint violations (e.g. unique_violation 23505, +// foreign_key_violation 23503) +// - 42xxx syntax/access-rule violations (includes 42501 +// insufficient_privilege, 42601 syntax_error, undefined_table 42P01, +// undefined_column 42703, datatype_mismatch 42804) +// +// Other classes (e.g. 08xxx connection exceptions, 58xxx system errors, XX000 +// internal_error) are NOT blocked: they may be transient, and pre-commit +// retrying them is safe. Blocking only the clearly-permanent classes keeps the +// blocklist conservative so we don't accidentally skip a retryable error. +func isPermanentPostgresError(pgErr *pgconn.PgError) bool { + return strings.HasPrefix(pgErr.Code, "22") || // data exception + strings.HasPrefix(pgErr.Code, "23") || // integrity constraint violation + strings.HasPrefix(pgErr.Code, "42") // syntax / access rule violation } // retryJitterMax is the upper bound for the random delay between retries. @@ -287,34 +293,58 @@ const retryJitterMax = 100 * time.Millisecond // isRetryablePostgresPreCommitError is the default classifier used by // RetryPostgresTx for errors that occur BEFORE Commit is called (i.e., from -// BeginTx or fn). It is isSafeRetryablePostgresError OR isPostgresNetworkError: -// pre-commit, typed network errors are also safe to retry because the server -// rolls back the uncommitted transaction. +// BeginTx or fn). Pre-commit, retrying is ALWAYS safe: the transaction rolls +// back, so a retried attempt starts from a clean slate. To avoid missing a +// transient error we haven't enumerated, this classifier OPTS OUT of retrying +// only the errors known to be permanent; everything else is retried. +// +// NOT retried (would fail again on the next attempt, so retrying only adds +// ~200ms of latency before the caller receives the error): // -// Permanent application errors are intentionally NOT retried here, even though -// retrying them would be safe (the transaction rolls back): they would fail -// again on the next attempt, so retrying only adds ~200ms of latency (3 -// attempts with full-jitter backoff up to retryJitterMax) before the caller -// receives the error. The categories we deliberately do NOT retry are: +// - context.Canceled / context.DeadlineExceeded: the caller's context is +// done; retrying would hit the deadline immediately. +// - Permanent PostgreSQL SQLSTATE classes — see isPermanentPostgresError: +// * 22xxx data exceptions (e.g. string_data_right_truncation 22001) +// * 23xxx integrity constraint violations (e.g. unique_violation 23505, +// foreign_key_violation 23503) +// * 42xxx syntax/access-rule violations (42501 insufficient_privilege, +// 42601 syntax_error, undefined_table 42P01, undefined_column 42703, +// datatype_mismatch 42804) // -// - Constraint violations (class 23xxx) — e.g. unique_violation 23505, -// foreign_key_violation 23503, check_violation 23514, not_null_violation -// 23502. The data conflict persists across retries. -// - Syntax errors (42601) — the SQL is malformed; indicates a code bug. -// - Schema errors (class 42xxx) — e.g. undefined_table 42P01, -// undefined_column 42703. Indicates a migration drift or code bug. -// - Permission errors (42501 insufficient_privilege) — permissions don't -// change mid-query. -// - Data exceptions (class 22xxx) — e.g. string_data_right_truncation 22001, -// datatype_mismatch 42804. The data doesn't fit the column/type. +// Retried (transient or unknown): everything else — including driver.ErrBadConn, +// pgconn.SafeToRetry-flagged errors, typed network errors (net.OpError, io.EOF), +// the known-transient SQLSTATE codes (40001, 40P01, 57P01, 57P02, 57P03, 53300, +// 57014), any *pgconn.PgError whose code is NOT in the permanent blocklist +// (opt-out — assume retryable unless known permanent, to avoid spurious +// failures on transient errors we haven't enumerated), and any other non- +// PgError error. // -// These are discovered at query time (not connection time, where -// incompatibility/handshake failures surface), so they reach fn and would -// recur on retry. Callers who want to retry a broader set (e.g. a custom -// "transaction rolled back" sentinel from a stored procedure) can pass -// opts.IsRetryable, which is additive on top of this classifier. +// The cost of a false positive (retrying a permanent error we didn't blocklist) +// is ~200ms of wasted latency; the cost of a false negative (NOT retrying a +// transient error we didn't allowlist) is a spurious user-facing failure. +// Pre-commit the former is strictly safer, so we opt out. +// +// Callers who want to ALSO opt out of retrying additional errors should pass +// opts.IsRetryable that returns false for them. Note that opts.IsRetryable is +// ADDITIVE (it can only widen the retry set via OR), so to NARROW it you'd need +// a wrapper that re-checks; if narrowing is a common need we can revisit the +// composition semantics. func isRetryablePostgresPreCommitError(err error) bool { - return isSafeRetryablePostgresError(err) || isPostgresNetworkError(err) + if err == nil { + return false + } + // Caller's context is done — retrying would hit the deadline immediately. + if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + return false + } + // Permanent SQLSTATE codes would fail again on retry. + var pgErr *pgconn.PgError + if errors.As(err, &pgErr) { + return !isPermanentPostgresError(pgErr) + } + // Everything else (network errors, driver.ErrBadConn, SafeToRetry-flagged, + // unknown errors) is retried — pre-commit the transaction rolls back. + return true } // RetryPostgresTx executes fn inside a Postgres transaction, retrying the @@ -323,20 +353,21 @@ func isRetryablePostgresPreCommitError(err error) bool { // committed. // // Pre-commit errors (from BeginTx or fn) are retried when -// isRetryablePostgresPreCommitError(err) OR opts.IsRetryable(err) is true — a -// broad set (pre-send + server-rolled-back + network), because the transaction -// rolls back so retrying is safe. Permanent application errors (constraint -// violations, syntax/schema/permission errors) are NOT retried; see -// isRetryablePostgresPreCommitError's doc for the full list. +// isRetryablePostgresPreCommitError(err) OR opts.IsRetryable(err) is true. +// isRetryablePostgresPreCommitError is an OPT-OUT classifier: pre-commit +// retrying is always safe (the transaction rolls back), so it retries +// everything EXCEPT context.Canceled/DeadlineExceeded and the permanent +// SQLSTATE classes (22xxx data exceptions, 23xxx constraint violations, 42xxx +// syntax/schema/permission). See its doc for the rationale. // // Commit-phase errors (from (*sql.Tx).Commit) are retried only when // isSafeRetryablePostgresError(err) OR opts.IsRetryable(err) is true — a -// narrower set (pre-send + server-rolled-back; network errors excluded), -// because a commit-phase network error may mean the commit already succeeded -// (e.g., the connection was severed after the COMMIT was sent but before the -// response was received). Commit-phase errors that are NOT retried are -// returned to the caller wrapped with ErrCommitPhase so the caller can decide -// whether to alert, reconcile, or check via pg_xact_status() (future). +// narrower OPT-IN set (pre-send + server-rolled-back SQLSTATE; network errors +// excluded), because a commit-phase network error may mean the commit already +// succeeded (e.g., the connection was severed after the COMMIT was sent but +// before the response was received). Commit-phase errors that are NOT retried +// are returned to the caller wrapped with ErrCommitPhase so the caller can +// decide whether to alert, reconcile, or check via pg_xact_status() (future). // // db MUST be a *sql.DB backed by the pgx stdlib adapter (e.g. one returned // by database.New with a PostgresConfig). Using a *sql.DB backed by another @@ -346,11 +377,12 @@ func RetryPostgresTx(ctx context.Context, db *sql.DB, opts RetryTxOptions, fn fu } // postgresTxClassifier is the per-phase retry classifier used by RetryPostgresTx. -// The preCommit classifier is broad (pre-send + server-rolled-back + network — -// any transient error is safe because the transaction rolls back); the commit -// classifier is the narrow isSafeRetryablePostgresError (pre-send + -// server-rolled-back; network excluded), because a commit-phase network error -// may mean the commit already succeeded. See ErrCommitPhase. +// The preCommit classifier is an OPT-OUT classifier (broad: retry everything +// except permanent errors and context cancellation, because pre-commit the +// transaction rolls back so retrying is always safe); the commit classifier is +// the OPT-IN isSafeRetryablePostgresError (narrow: only pre-send + +// server-rolled-back, because a commit-phase network error may mean the commit +// already succeeded). See ErrCommitPhase. var postgresTxClassifier = retryClassifier{ preCommit: isRetryablePostgresPreCommitError, commit: isSafeRetryablePostgresError, diff --git a/database/postgres_test.go b/database/postgres_test.go index 776fcddf..8e659952 100644 --- a/database/postgres_test.go +++ b/database/postgres_test.go @@ -185,7 +185,7 @@ func Test_Postgres_Alloy_Migrations(t *testing.T) { func TestRetryUnsafe(t *testing.T) { t.Run("succeeds on first attempt", func(t *testing.T) { calls := 0 - err := database.RetryUnsafe(context.Background(), database.RetryUnsafeOptions{MaxAttempts: 3}, func() error { + err := database.RetryUnsafe(context.Background(), database.RetryUnsafeOptions{}, func() error { calls++ return nil }) @@ -198,7 +198,7 @@ func TestRetryUnsafe(t *testing.T) { // RetryUnsafe's default predicate retries any error except context // cancellation because the caller has vouched that fn is idempotent. calls := 0 - err := database.RetryUnsafe(context.Background(), database.RetryUnsafeOptions{MaxAttempts: 3}, func() error { + err := database.RetryUnsafe(context.Background(), database.RetryUnsafeOptions{}, func() error { calls++ if calls < 3 { return &pgconn.PgError{Code: "23505"} // unique_violation @@ -212,7 +212,7 @@ func TestRetryUnsafe(t *testing.T) { t.Run("custom IsRetryable short-circuits non-retryable errors", func(t *testing.T) { calls := 0 neverRetry := func(error) bool { return false } - err := database.RetryUnsafe(context.Background(), database.RetryUnsafeOptions{MaxAttempts: 3, IsRetryable: neverRetry}, func() error { + err := database.RetryUnsafe(context.Background(), database.RetryUnsafeOptions{IsRetryable: neverRetry}, func() error { calls++ return io.EOF }) @@ -223,7 +223,7 @@ func TestRetryUnsafe(t *testing.T) { t.Run("does not retry context cancellation by default", func(t *testing.T) { calls := 0 - err := database.RetryUnsafe(context.Background(), database.RetryUnsafeOptions{MaxAttempts: 3}, func() error { + err := database.RetryUnsafe(context.Background(), database.RetryUnsafeOptions{}, func() error { calls++ return context.Canceled }) @@ -237,7 +237,7 @@ func TestRetryUnsafe(t *testing.T) { cancel() // cancel immediately calls := 0 - err := database.RetryUnsafe(ctx, database.RetryUnsafeOptions{MaxAttempts: 3}, func() error { + err := database.RetryUnsafe(ctx, database.RetryUnsafeOptions{}, func() error { calls++ return io.EOF // retryable, but context is done }) @@ -248,7 +248,7 @@ func TestRetryUnsafe(t *testing.T) { t.Run("exhausts all attempts", func(t *testing.T) { calls := 0 - err := database.RetryUnsafe(context.Background(), database.RetryUnsafeOptions{MaxAttempts: 3}, func() error { + err := database.RetryUnsafe(context.Background(), database.RetryUnsafeOptions{}, func() error { calls++ return io.EOF }) diff --git a/database/retry.go b/database/retry.go index 08ce5282..6eaaf85a 100644 --- a/database/retry.go +++ b/database/retry.go @@ -12,8 +12,6 @@ import ( // RetryTxOptions configures the transactional (safe) retry functions // (RetryPostgresTx and the future RetryMySQLTx / RetrySpannerTx). type RetryTxOptions struct { - // MaxAttempts caps the number of transaction attempts. Defaults to 3 if <= 0. - MaxAttempts int // TxOptions is passed to (*sql.DB).BeginTx on each attempt. nil = default isolation. TxOptions *sql.TxOptions // IsRetryable, if non-nil, is consulted IN ADDITION to the backend's default @@ -26,13 +24,19 @@ type RetryTxOptions struct { // RetryUnsafeOptions configures RetryUnsafe. type RetryUnsafeOptions struct { - // MaxAttempts caps the number of attempts. Defaults to 3 if <= 0. - MaxAttempts int // IsRetryable, if nil, defaults to "retry on any error except // context.Canceled / context.DeadlineExceeded". If non-nil, replaces the default. IsRetryable func(err error) bool } +// maxRetryAttempts is the maximum number of attempts for RetryPostgresTx and +// RetryUnsafe (the initial attempt plus 2 retries), matching database/sql's +// maxBadConnRetries+1 convention. It is intentionally not configurable: +// services that need a different retry budget should bound the total time via +// the context deadline (the retry loop exits early when ctx is done) or wrap +// the call in their own retry loop for more attempts. +const maxRetryAttempts = 3 + // ErrCommitPhase wraps an error that occurred during the commit phase of a // retried transaction (i.e., from (*sql.Tx).Commit) and was NOT retried. A // commit-phase error is ambiguous: the commit may have succeeded on the server @@ -75,21 +79,17 @@ func (c retryClassifier) classify(err error, commitPhase bool) bool { return c.preCommit(err) } -// RetryUnsafe executes fn up to MaxAttempts times, retrying on any error that -// opts.IsRetryable classifies as retryable (default: any error except context -// cancellation/deadline). fn MUST be idempotent — no transaction wrapper is -// provided, so a retry may re-execute work that already committed. +// RetryUnsafe executes fn up to maxRetryAttempts times, retrying on any error +// that opts.IsRetryable classifies as retryable (default: any error except +// context cancellation/deadline). fn MUST be idempotent — no transaction +// wrapper is provided, so a retry may re-execute work that already committed. func RetryUnsafe(ctx context.Context, opts RetryUnsafeOptions, fn func() error) error { - maxAttempts := opts.MaxAttempts - if maxAttempts <= 0 { - maxAttempts = 3 - } isRetryable := opts.IsRetryable if isRetryable == nil { isRetryable = isRetryableUnsafeDefault } var err error - for attempt := 0; attempt < maxAttempts; attempt++ { + for attempt := 0; attempt < maxRetryAttempts; attempt++ { err = fn() if err == nil { return nil @@ -97,7 +97,7 @@ func RetryUnsafe(ctx context.Context, opts RetryUnsafeOptions, fn func() error) if !isRetryable(err) { return err } - if !sleepWithJitter(ctx, attempt, maxAttempts) { + if !sleepWithJitter(ctx, attempt, maxRetryAttempts) { return ctx.Err() } } @@ -120,11 +120,18 @@ func isRetryableUnsafeDefault(err error) bool { // ErrCommitPhase before being returned to the caller, so the caller can detect // that the error occurred during the commit phase (where the commit may have // succeeded before the error was returned). +// +// database/sql already retries driver.ErrBadConn internally for *sql.DB methods +// (up to maxBadConnRetries=2, i.e. 3 total attempts) before surfacing it. That +// internal retry is immediate (no delay) and handles transient bad connections +// on fresh conns; this outer retry layers on top with full-jitter backoff to +// survive longer outages (e.g. an AlloyDB switchover). The composition is +// intentional: the internal retry covers the "connection was bad before I used +// it" case without delay, and this outer retry covers the "the database was +// briefly unavailable" case with jitter to spread fleet-wide retries. For +// *sql.Tx methods (tx.Exec, tx.Commit) there is no internal retry, so this +// outer retry is the only retry. func retryTx(ctx context.Context, db beginTxer, base retryClassifier, opts RetryTxOptions, fn func(*sql.Tx) error) error { - maxAttempts := opts.MaxAttempts - if maxAttempts <= 0 { - maxAttempts = 3 - } classifier := base if opts.IsRetryable != nil { extra := opts.IsRetryable @@ -135,7 +142,7 @@ func retryTx(ctx context.Context, db beginTxer, base retryClassifier, opts Retry } var lastErr error var lastCommitPhase bool - for attempt := 0; attempt < maxAttempts; attempt++ { + for attempt := 0; attempt < maxRetryAttempts; attempt++ { var commitPhase bool lastErr, commitPhase = runOneTxAttempt(ctx, db, opts.TxOptions, fn) lastCommitPhase = commitPhase @@ -145,7 +152,7 @@ func retryTx(ctx context.Context, db beginTxer, base retryClassifier, opts Retry if !classifier.classify(lastErr, commitPhase) { break } - if !sleepWithJitter(ctx, attempt, maxAttempts) { + if !sleepWithJitter(ctx, attempt, maxRetryAttempts) { return ctx.Err() } } diff --git a/database/retry_test.go b/database/retry_test.go index 06d67b3a..8ab42588 100644 --- a/database/retry_test.go +++ b/database/retry_test.go @@ -123,7 +123,7 @@ func TestRetryPostgresTx(t *testing.T) { defer db.Close() calls := 0 - err := RetryPostgresTx(context.Background(), db, RetryTxOptions{MaxAttempts: 3}, func(*sql.Tx) error { + err := RetryPostgresTx(context.Background(), db, RetryTxOptions{}, func(*sql.Tx) error { calls++ return nil }) @@ -139,7 +139,7 @@ func TestRetryPostgresTx(t *testing.T) { defer db.Close() calls := 0 - err := RetryPostgresTx(context.Background(), db, RetryTxOptions{MaxAttempts: 3}, func(*sql.Tx) error { + err := RetryPostgresTx(context.Background(), db, RetryTxOptions{}, func(*sql.Tx) error { calls++ if calls < 2 { return &pgconn.PgError{Code: "40001"} // serialization_failure -> retryable @@ -161,7 +161,7 @@ func TestRetryPostgresTx(t *testing.T) { defer db.Close() calls := 0 - err := RetryPostgresTx(context.Background(), db, RetryTxOptions{MaxAttempts: 3}, func(*sql.Tx) error { + err := RetryPostgresTx(context.Background(), db, RetryTxOptions{}, func(*sql.Tx) error { calls++ return &pgconn.PgError{Code: "23505"} // unique_violation -> NOT retryable }) @@ -181,7 +181,7 @@ func TestRetryPostgresTx(t *testing.T) { defer db.Close() calls := 0 - err := RetryPostgresTx(context.Background(), db, RetryTxOptions{MaxAttempts: 3}, func(*sql.Tx) error { + err := RetryPostgresTx(context.Background(), db, RetryTxOptions{}, func(*sql.Tx) error { calls++ if calls < 2 { return driver.ErrBadConn @@ -204,7 +204,7 @@ func TestRetryPostgresTx(t *testing.T) { defer db.Close() calls := 0 - err := RetryPostgresTx(context.Background(), db, RetryTxOptions{MaxAttempts: 3}, func(*sql.Tx) error { + err := RetryPostgresTx(context.Background(), db, RetryTxOptions{}, func(*sql.Tx) error { calls++ return nil }) @@ -221,7 +221,7 @@ func TestRetryPostgresTx(t *testing.T) { defer db.Close() calls := 0 - err := RetryPostgresTx(context.Background(), db, RetryTxOptions{MaxAttempts: 3}, func(*sql.Tx) error { + err := RetryPostgresTx(context.Background(), db, RetryTxOptions{}, func(*sql.Tx) error { calls++ return nil }) @@ -243,7 +243,7 @@ func TestRetryPostgresTx(t *testing.T) { defer db.Close() calls := 0 - err := RetryPostgresTx(context.Background(), db, RetryTxOptions{MaxAttempts: 3}, func(*sql.Tx) error { + err := RetryPostgresTx(context.Background(), db, RetryTxOptions{}, func(*sql.Tx) error { calls++ return nil }) @@ -271,7 +271,7 @@ func TestRetryPostgresTx(t *testing.T) { defer db.Close() calls := 0 - err := RetryPostgresTx(context.Background(), db, RetryTxOptions{MaxAttempts: 3}, func(*sql.Tx) error { + err := RetryPostgresTx(context.Background(), db, RetryTxOptions{}, func(*sql.Tx) error { calls++ return nil }) @@ -297,7 +297,6 @@ func TestRetryPostgresTx(t *testing.T) { calls := 0 err := RetryPostgresTx(context.Background(), db, RetryTxOptions{ - MaxAttempts: 3, IsRetryable: func(err error) bool { return errors.Is(err, sentinelErr) }, }, func(*sql.Tx) error { calls++ @@ -319,7 +318,7 @@ func TestRetryPostgresTx(t *testing.T) { defer db.Close() calls := 0 - err := RetryPostgresTx(ctx, db, RetryTxOptions{MaxAttempts: 3}, func(*sql.Tx) error { + err := RetryPostgresTx(ctx, db, RetryTxOptions{}, func(*sql.Tx) error { calls++ if calls == 1 { cancel() // cancel after the first attempt runs @@ -338,7 +337,7 @@ func TestRetryPostgresTx(t *testing.T) { defer db.Close() calls := 0 - err := RetryPostgresTx(context.Background(), db, RetryTxOptions{MaxAttempts: 3}, func(*sql.Tx) error { + err := RetryPostgresTx(context.Background(), db, RetryTxOptions{}, func(*sql.Tx) error { calls++ return &pgconn.PgError{Code: "40P01"} // deadlock_detected -> retryable }) @@ -351,10 +350,12 @@ func TestRetryPostgresTx(t *testing.T) { } func TestIsSafeRetryablePostgresError(t *testing.T) { - // isSafeRetryablePostgresError is the "safe to retry at any phase" - // classifier — pre-send guarantees + server-rolled-back SQLSTATE codes. - // It is used directly as the commit-phase classifier for RetryPostgresTx - // and as the foundation of the pre-commit classifier. + // isSafeRetryablePostgresError is the OPT-IN commit-phase classifier for + // RetryPostgresTx: pre-send guarantees + server-rolled-back SQLSTATE codes. + // Network errors are excluded (at commit, a network error could mean the + // commit succeeded but the response was lost — see ErrCommitPhase). The + // pre-commit classifier (isRetryablePostgresPreCommitError) is a separate + // OPT-OUT classifier that retries a broader set. // Pre-send: driver.ErrBadConn (pgx stdlib adapter only produces this for // SafeToRetry-flagged Exec/Query errors) and pgconn.SafeToRetry-flagged @@ -388,23 +389,22 @@ func TestIsSafeRetryablePostgresError(t *testing.T) { } func TestIsRetryablePostgresPreCommitError(t *testing.T) { - // Leg 1: driver.ErrBadConn (fn-path signal — pgx stdlib adapter converts - // SafeToRetry Exec/Query errors to this). + // isRetryablePostgresPreCommitError is an OPT-OUT classifier: pre-commit + // retrying is always safe (the transaction rolls back), so it retries + // everything EXCEPT context.Canceled/DeadlineExceeded and the permanent + // SQLSTATE classes (22xxx, 23xxx, 42xxx). + + // Retried: pre-send guarantees. require.True(t, isRetryablePostgresPreCommitError(driver.ErrBadConn)) require.True(t, isRetryablePostgresPreCommitError(fmt.Errorf("wrapped: %w", driver.ErrBadConn))) - - // Leg 2: pgconn.SafeToRetry-flagged — pgx guarantees these ALWAYS occur - // before any data is sent to the server. require.True(t, isRetryablePostgresPreCommitError(&safeToRetryErr{msg: "pre-send"})) - // Leg 3: isSafeRetryablePostgresError SQLSTATE codes that guarantee rollback. + // Retried: known-transient SQLSTATE codes (server rolled back). require.True(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "40001"})) // serialization_failure require.True(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "40P01"})) // deadlock_detected require.True(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "57P01"})) // admin_shutdown - // Leg 4: isPostgresNetworkError (typed network errors — safe pre-commit - // because the server rolls back the uncommitted tx). NOT in - // isSafeRetryablePostgresError (unsafe without a tx). + // Retried: typed network errors (non-PgError -> retry under opt-out). require.True(t, isRetryablePostgresPreCommitError(io.EOF)) require.True(t, isRetryablePostgresPreCommitError(io.ErrUnexpectedEOF)) require.True(t, isRetryablePostgresPreCommitError(&net.OpError{ @@ -412,10 +412,31 @@ func TestIsRetryablePostgresPreCommitError(t *testing.T) { Err: errors.New("connection reset by peer"), })) - // Non-retryable: application errors and context cancellation. - require.False(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "23505"})) // unique_violation - require.False(t, isRetryablePostgresPreCommitError(errors.New("some application error"))) + // Retried (opt-out): an UNKNOWN *pgconn.PgError code is retried because + // pre-commit it's safe and we'd rather not miss a transient error we + // haven't enumerated. XX000 (internal_error) is not in the permanent + // blocklist, so it's retried. + require.True(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "XX000"})) // internal_error (unknown/transient) + + // Retried (opt-out): a non-PgError, non-network error is retried. fn might + // return a wrapped error; pre-commit retrying is safe, and the cost of a + // false positive (200ms wasted on a permanent error) is less than the cost + // of a false negative (spurious user-facing failure). + require.True(t, isRetryablePostgresPreCommitError(errors.New("some application error"))) + + // NOT retried: permanent SQLSTATE classes (would fail again on retry). + require.False(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "23505"})) // 23xxx unique_violation + require.False(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "23503"})) // 23xxx foreign_key_violation + require.False(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "22001"})) // 22xxx string_data_right_truncation + require.False(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "42601"})) // 42xxx syntax_error + require.False(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "42501"})) // 42xxx insufficient_privilege + require.False(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "42P01"})) // 42xxx undefined_table + require.False(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "42703"})) // 42xxx undefined_column + + // NOT retried: caller's context is done. require.False(t, isRetryablePostgresPreCommitError(context.Canceled)) + require.False(t, isRetryablePostgresPreCommitError(context.DeadlineExceeded)) + require.False(t, isRetryablePostgresPreCommitError(nil)) } func TestRetryMySQLTxNotImplemented(t *testing.T) { From e310ce905bbf9b84d45235d1e9aa4ee73e51a5c9 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Tue, 30 Jun 2026 03:47:12 +0000 Subject: [PATCH 06/13] database: drop driver.ErrBadConn leg from the commit classifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The driver.ErrBadConn check in isSafeRetryablePostgresError (the commit-phase classifier) was unreachable dead code: - *sql.Tx.Commit does not retry driver.ErrBadConn internally (only *sql.DB methods have the maxBadConnRetries loop — see golang/go#11264), so there is no database/sql retry to double up with at commit time. - pgx's wrapTx.Commit returns the native pgx error directly (return wtx.tx.Commit(wtx.ctx)); the SafeToRetry -> driver.ErrBadConn conversion only happens in Conn.ExecContext/QueryContext (i.e., for fn's tx.Exec/tx.Query), NOT for Commit. So the commit classifier never sees driver.ErrBadConn with pgx. - The pre-send commit case (a SafeToRetry-flagged pgx error from commit) is already covered by the pgconn.SafeToRetry leg. Removed the leg and the now-unused database/sql/driver import from postgres.go, and documented on isSafeRetryablePostgresError why driver.ErrBadConn is not checked there. driver.ErrBadConn from fn's tx.Exec is still retryable — that's a pre-commit error handled by isRetryablePostgresPreCommitError's opt-out catch-all (TestIsRetryablePostgresPreCommitError covers it). TestIsSafeRetryablePostgresError: removed the driver.ErrBadConn -> true assertions (now false) and added a comment explaining why driver.ErrBadConn isn't in the commit classifier. Co-authored-by: E-Love (Eric Loveland) --- database/postgres.go | 21 +++++++++++---------- database/retry_test.go | 22 +++++++++++++++------- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/database/postgres.go b/database/postgres.go index 7765005c..b4f7f1cc 100644 --- a/database/postgres.go +++ b/database/postgres.go @@ -3,7 +3,6 @@ package database import ( "context" "database/sql" - "database/sql/driver" "errors" "fmt" "net" @@ -180,9 +179,6 @@ func PostgresDeadlockFound(err error) bool { // occurred (the operation never reached the server, or the server rolled back // before returning the error). It is the OR of: // -// - errors.Is(err, driver.ErrBadConn): pgx's stdlib adapter only produces -// this for pgconn.SafeToRetry-flagged (pre-send) errors, so the operation -// never reached the server. // - pgconn.SafeToRetry(err): pgx guarantees the error occurred before any // data was sent to the server (e.g., during connection acquisition, // conn-busy, HA NotPreferredError, pre-send timeouts). @@ -195,6 +191,17 @@ func PostgresDeadlockFound(err error) bool { // opt-out classifier that retries a broader set, because pre-commit the // transaction rolls back so retrying is always safe. // +// driver.ErrBadConn is intentionally NOT checked here. It is produced by pgx's +// stdlib adapter only inside Conn.ExecContext/QueryContext (i.e., from fn's +// tx.Exec/tx.Query), NOT from wrapTx.Commit — pgx's Commit returns the native +// pgx error directly. So the commit classifier never sees driver.ErrBadConn +// with pgx, and the pre-send commit case is already covered by pgconn.SafeToRetry +// above. (driver.ErrBadConn from fn's tx.Exec IS retryable, but that's a +// pre-commit error handled by isRetryablePostgresPreCommitError's opt-out +// catch-all.) Note also that *sql.Tx.Commit does not retry driver.ErrBadConn +// internally — only *sql.DB methods do — so there is no database/sql retry to +// double up with at commit time. +// // Network-level errors (net.OpError, io.EOF) are intentionally NOT classified // here: without an explicit transaction there is no way to know whether the // error occurred before or after the server accepted the query, and at commit @@ -208,12 +215,6 @@ func isSafeRetryablePostgresError(err error) bool { if err == nil { return false } - // driver.ErrBadConn is produced by pgx's stdlib adapter only for - // pgconn.SafeToRetry-flagged (pre-send) errors from Exec/Query, so the - // operation never reached the server. Safe to retry at any phase. - if errors.Is(err, driver.ErrBadConn) { - return true - } // pgconn.SafeToRetry is pgx's authoritative flag for "occurred before any // data was sent to the server". Covers Begin/Commit failures where the // original pgx error flows through the stdlib adapter unchanged. diff --git a/database/retry_test.go b/database/retry_test.go index 8ab42588..353066bc 100644 --- a/database/retry_test.go +++ b/database/retry_test.go @@ -356,13 +356,21 @@ func TestIsSafeRetryablePostgresError(t *testing.T) { // commit succeeded but the response was lost — see ErrCommitPhase). The // pre-commit classifier (isRetryablePostgresPreCommitError) is a separate // OPT-OUT classifier that retries a broader set. - - // Pre-send: driver.ErrBadConn (pgx stdlib adapter only produces this for - // SafeToRetry-flagged Exec/Query errors) and pgconn.SafeToRetry-flagged - // errors. Safe at any phase because the operation never reached the server. - require.True(t, isSafeRetryablePostgresError(driver.ErrBadConn)) - require.True(t, isSafeRetryablePostgresError(fmt.Errorf("wrapped: %w", driver.ErrBadConn))) - require.True(t, isSafeRetryablePostgresError(&safeToRetryErr{msg: "pre-send"})) + // + // driver.ErrBadConn is intentionally NOT in this classifier: pgx's + // wrapTx.Commit returns native pgx errors (not driver.ErrBadConn), so the + // commit path never sees it. The pre-send commit case is covered by + // pgconn.SafeToRetry below. driver.ErrBadConn from fn's tx.Exec IS + // retryable, but that's a pre-commit error handled by the opt-out catch-all + // (see TestIsRetryablePostgresPreCommitError). + + // Pre-send: pgconn.SafeToRetry-flagged errors. Safe at any phase because + // the operation never reached the server. + require.True(t, isSafeRetryablePostgresError(&safeToRetryErr{msg: "pre-send commit"})) + + // driver.ErrBadConn is NOT retryable here (the commit classifier doesn't + // check it — see the comment above). + require.False(t, isSafeRetryablePostgresError(driver.ErrBadConn)) // SQLSTATE codes that the server guarantees were rolled back. require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "57P01"})) // admin_shutdown From 53917aa1d6357db5a2b7499fcdb2c289b69d959d Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Tue, 30 Jun 2026 04:16:42 +0000 Subject: [PATCH 07/13] database: tidy retry-tx comments for concision Trim verbose comments, drop general-knowledge explanations (e.g. Rollback is a no-op after Commit), remove rationale duplicated across function docs, and fix stale references to the renamed isRetryablePostgresTxError in the mysql.go/spanner.go placeholder TODOs. Behavior unchanged. Co-authored-by: E-Love (Eric Loveland) --- database/mysql.go | 8 +- database/postgres.go | 211 +++++++++++------------------------------ database/retry.go | 120 +++++++++-------------- database/retry_test.go | 61 ++++-------- database/spanner.go | 11 +-- 5 files changed, 120 insertions(+), 291 deletions(-) diff --git a/database/mysql.go b/database/mysql.go index 06badb6b..c0d75cf9 100644 --- a/database/mysql.go +++ b/database/mysql.go @@ -247,11 +247,9 @@ func MySQLDeadlockFound(err error) bool { } // RetryMySQLTx is the MySQL equivalent of RetryPostgresTx. -// TODO: implement with MySQL retryable codes (1213 deadlock, 1205 lock wait -// timeout, 2013 connection lost, network resets). The shared retryTx helper -// in retry.go already handles driver.ErrBadConn (backend-agnostic) and the -// Begin/fn/Commit/Rollback loop; the missing piece is a MySQL-specific -// default classifier analogous to isRetryablePostgresTxError. See +// TODO: implement a MySQL-specific retryClassifier (retryTx already handles the +// Begin/fn/Commit/Rollback loop and driver.ErrBadConn). Cover 1213 deadlock, +// 1205 lock wait timeout, 2013 connection lost. See // https://dev.mysql.com/doc/mysql-errors/8.0/en/server-error-reference.html func RetryMySQLTx(ctx context.Context, db *sql.DB, opts RetryTxOptions, fn func(*sql.Tx) error) error { _ = ctx diff --git a/database/postgres.go b/database/postgres.go index b4f7f1cc..a51a752b 100644 --- a/database/postgres.go +++ b/database/postgres.go @@ -174,216 +174,113 @@ func PostgresDeadlockFound(err error) bool { return strings.Contains(err.Error(), postgresErrDeadlockFound) } -// isSafeRetryablePostgresError returns true if the error is safe to retry -// regardless of transaction phase — i.e., the error guarantees no side effects -// occurred (the operation never reached the server, or the server rolled back -// before returning the error). It is the OR of: +// isSafeRetryablePostgresError is the commit-phase classifier for +// RetryPostgresTx: true iff the error guarantees no side effects (pre-send or +// server-rolled-back), so retry is safe even at commit. // -// - pgconn.SafeToRetry(err): pgx guarantees the error occurred before any -// data was sent to the server (e.g., during connection acquisition, -// conn-busy, HA NotPreferredError, pre-send timeouts). -// - server-returned SQLSTATE codes that guarantee rollback (40001, 40P01, -// 57P01, 57P02, 57P03, 53300, 57014) — the server rolled back the -// transaction before returning the error. +// - pgconn.SafeToRetry: pgx guarantees the error occurred before any data +// was sent to the server. +// - SQLSTATE codes the server rolled back before returning: 40001, 40P01, +// 57P01, 57P02, 57P03, 53300, 57014. // -// It is used directly as the commit-phase classifier for RetryPostgresTx. -// The pre-commit classifier (isRetryablePostgresPreCommitError) is a separate -// opt-out classifier that retries a broader set, because pre-commit the -// transaction rolls back so retrying is always safe. +// driver.ErrBadConn not checked: pgx's Commit returns native errors (the +// ErrBadConn conversion is only in Exec/Query), so the commit path never sees +// it; pre-send commit is covered by pgconn.SafeToRetry. *sql.Tx.Commit doesn't +// retry ErrBadConn internally (only *sql.DB methods do), so no double-retry. // -// driver.ErrBadConn is intentionally NOT checked here. It is produced by pgx's -// stdlib adapter only inside Conn.ExecContext/QueryContext (i.e., from fn's -// tx.Exec/tx.Query), NOT from wrapTx.Commit — pgx's Commit returns the native -// pgx error directly. So the commit classifier never sees driver.ErrBadConn -// with pgx, and the pre-send commit case is already covered by pgconn.SafeToRetry -// above. (driver.ErrBadConn from fn's tx.Exec IS retryable, but that's a -// pre-commit error handled by isRetryablePostgresPreCommitError's opt-out -// catch-all.) Note also that *sql.Tx.Commit does not retry driver.ErrBadConn -// internally — only *sql.DB methods do — so there is no database/sql retry to -// double up with at commit time. +// Network errors excluded: at commit, could mean commit succeeded but the +// response was lost (see ErrCommitPhase). // -// Network-level errors (net.OpError, io.EOF) are intentionally NOT classified -// here: without an explicit transaction there is no way to know whether the -// error occurred before or after the server accepted the query, and at commit -// time a network error could mean the commit succeeded but the response was -// lost (see ErrCommitPhase). -// -// This is unexported because the recommended path for callers is RetryPostgresTx -// (safe, transactional) or RetryUnsafe (idempotent caller), not building a -// custom retry loop on top of this classifier. +// Unexported: use RetryPostgresTx or RetryUnsafe rather than building on this. func isSafeRetryablePostgresError(err error) bool { if err == nil { return false } - // pgconn.SafeToRetry is pgx's authoritative flag for "occurred before any - // data was sent to the server". Covers Begin/Commit failures where the - // original pgx error flows through the stdlib adapter unchanged. if pgconn.SafeToRetry(err) { return true } - // 57P01 admin_shutdown, 57P02 crash_shutdown: fast shutdown rolls back all - // active transactions before disconnecting clients. See: - // https://www.postgresql.org/docs/current/server-shutdown.html - // - // 57P03 cannot_connect_now: server is still starting up; the operation never - // reached a transaction. - // - // 40001 serialization_failure, 40P01 deadlock_detected: the server explicitly - // rolled back the transaction and expects the client to retry. See: - // https://www.postgresql.org/docs/current/mvcc-serialization-failure-handling.html - // - // 53300 too_many_connections: rejected at connect time; no transaction started. - // - // 57014 query_canceled: the server rolled back the transaction before returning - // this error. - // - // Note: 08xxx (connection_exception class) codes are intentionally omitted - // here. pgx surfaces connection-level failures as network errors, not as - // *pgconn.PgError; the pre-commit classifier (isRetryablePostgresPreCommitError) - // retries those because pre-commit the transaction rolls back. At commit time - // they would be wrapped with ErrCommitPhase (conservative — no server response - // means the commit may have succeeded). + // SQLSTATE codes the server rolled back (or never started a tx) before + // returning — safe to retry. See: + // https://www.postgresql.org/docs/current/server-shutdown.html (57P01/57P02/57P03) + // https://www.postgresql.org/docs/current/mvcc-serialization-failure-handling.html (40001/40P01) + // 53300 too_many_connections and 57014 query_canceled also rolled back. + // 08xxx omitted: pgx surfaces those as network errors, not *pgconn.PgError. var pgErr *pgconn.PgError if errors.As(err, &pgErr) { switch pgErr.Code { - case "57P01", "57P02", "57P03", // admin_shutdown, crash_shutdown, cannot_connect_now + case "57P01", "57P02", "57P03", // admin/crash shutdown, cannot_connect_now "40001", "40P01", // serialization_failure, deadlock_detected "53300", // too_many_connections "57014": // query_canceled return true } - return false } return false } -// isPermanentPostgresError returns true for PostgreSQL SQLSTATE classes that -// indicate a permanent error which would fail again on retry. Used by the -// pre-commit classifier (isRetryablePostgresPreCommitError) to opt OUT of -// retrying these — pre-commit retrying them would be safe (the transaction -// rolls back) but futile, adding ~200ms of latency before the caller receives -// the error. The classes blocked here all surface at query time (not connection -// time) and don't resolve across retries: +// isPermanentPostgresError reports SQLSTATE classes that would fail again on +// retry, used by isRetryablePostgresPreCommitError to opt out. Only +// clearly-permanent classes are listed so unknown/transient codes are retried. // -// - 22xxx data exceptions (e.g. string_data_right_truncation 22001) -// - 23xxx integrity constraint violations (e.g. unique_violation 23505, -// foreign_key_violation 23503) -// - 42xxx syntax/access-rule violations (includes 42501 -// insufficient_privilege, 42601 syntax_error, undefined_table 42P01, -// undefined_column 42703, datatype_mismatch 42804) -// -// Other classes (e.g. 08xxx connection exceptions, 58xxx system errors, XX000 -// internal_error) are NOT blocked: they may be transient, and pre-commit -// retrying them is safe. Blocking only the clearly-permanent classes keeps the -// blocklist conservative so we don't accidentally skip a retryable error. +// - 22xxx data exceptions (e.g. 22001 string_data_right_truncation) +// - 23xxx integrity constraint violations (e.g. 23505 unique_violation) +// - 42xxx syntax/access-rule violations (42501, 42601, 42P01, 42703, 42804) func isPermanentPostgresError(pgErr *pgconn.PgError) bool { return strings.HasPrefix(pgErr.Code, "22") || // data exception strings.HasPrefix(pgErr.Code, "23") || // integrity constraint violation strings.HasPrefix(pgErr.Code, "42") // syntax / access rule violation } -// retryJitterMax is the upper bound for the random delay between retries. -// Full jitter in [0, retryJitterMax) spreads concurrent retries across the -// fleet rather than letting them all slam the database at the same instant. -// AlloyDB planned maintenance switchovers typically cause less than one second -// of downtime, so three attempts with up to 100ms between each gives a worst- -// case retry window of ~200ms — well within typical service SLAs while still -// spanning the switchover blip. The caller's context deadline is the escape -// hatch for services with tighter latency budgets. +// retryJitterMax is the upper bound on the random delay between retries. Full +// jitter in [0, retryJitterMax) spreads concurrent retries across the fleet +// rather than slamming the database all at once. AlloyDB maintenance +// switchovers cause <1s downtime, so maxRetryAttempts attempts with up to +// retryJitterMax between each spans the blip; the caller's context deadline is +// the escape hatch for tighter latency budgets. const retryJitterMax = 100 * time.Millisecond -// isRetryablePostgresPreCommitError is the default classifier used by -// RetryPostgresTx for errors that occur BEFORE Commit is called (i.e., from -// BeginTx or fn). Pre-commit, retrying is ALWAYS safe: the transaction rolls -// back, so a retried attempt starts from a clean slate. To avoid missing a -// transient error we haven't enumerated, this classifier OPTS OUT of retrying -// only the errors known to be permanent; everything else is retried. -// -// NOT retried (would fail again on the next attempt, so retrying only adds -// ~200ms of latency before the caller receives the error): +// isRetryablePostgresPreCommitError is the pre-commit classifier for +// RetryPostgresTx (errors from BeginTx or fn). OPTS OUT: pre-commit retry is +// always safe (the transaction rolls back), so it retries everything EXCEPT +// context.Canceled/DeadlineExceeded and the permanent SQLSTATE classes +// (isPermanentPostgresError). Unknown *pgconn.PgError codes and non-PgError +// errors (network, driver.ErrBadConn, SafeToRetry) are retried — a false +// positive (~200ms wasted on a permanent error) costs less than a false +// negative (spurious user-facing failure), so opt-out is safer than opt-in. // -// - context.Canceled / context.DeadlineExceeded: the caller's context is -// done; retrying would hit the deadline immediately. -// - Permanent PostgreSQL SQLSTATE classes — see isPermanentPostgresError: -// * 22xxx data exceptions (e.g. string_data_right_truncation 22001) -// * 23xxx integrity constraint violations (e.g. unique_violation 23505, -// foreign_key_violation 23503) -// * 42xxx syntax/access-rule violations (42501 insufficient_privilege, -// 42601 syntax_error, undefined_table 42P01, undefined_column 42703, -// datatype_mismatch 42804) -// -// Retried (transient or unknown): everything else — including driver.ErrBadConn, -// pgconn.SafeToRetry-flagged errors, typed network errors (net.OpError, io.EOF), -// the known-transient SQLSTATE codes (40001, 40P01, 57P01, 57P02, 57P03, 53300, -// 57014), any *pgconn.PgError whose code is NOT in the permanent blocklist -// (opt-out — assume retryable unless known permanent, to avoid spurious -// failures on transient errors we haven't enumerated), and any other non- -// PgError error. -// -// The cost of a false positive (retrying a permanent error we didn't blocklist) -// is ~200ms of wasted latency; the cost of a false negative (NOT retrying a -// transient error we didn't allowlist) is a spurious user-facing failure. -// Pre-commit the former is strictly safer, so we opt out. -// -// Callers who want to ALSO opt out of retrying additional errors should pass -// opts.IsRetryable that returns false for them. Note that opts.IsRetryable is -// ADDITIVE (it can only widen the retry set via OR), so to NARROW it you'd need -// a wrapper that re-checks; if narrowing is a common need we can revisit the -// composition semantics. +// opts.IsRetryable is additive (OR); to narrow the set, wrap with a predicate +// that re-checks. func isRetryablePostgresPreCommitError(err error) bool { if err == nil { return false } - // Caller's context is done — retrying would hit the deadline immediately. if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { return false } - // Permanent SQLSTATE codes would fail again on retry. var pgErr *pgconn.PgError if errors.As(err, &pgErr) { return !isPermanentPostgresError(pgErr) } - // Everything else (network errors, driver.ErrBadConn, SafeToRetry-flagged, - // unknown errors) is retried — pre-commit the transaction rolls back. return true } -// RetryPostgresTx executes fn inside a Postgres transaction, retrying the -// entire transaction on transient errors. fn receives a fresh *sql.Tx each -// attempt; if fn returns an error the tx is rolled back, if nil the tx is -// committed. -// -// Pre-commit errors (from BeginTx or fn) are retried when -// isRetryablePostgresPreCommitError(err) OR opts.IsRetryable(err) is true. -// isRetryablePostgresPreCommitError is an OPT-OUT classifier: pre-commit -// retrying is always safe (the transaction rolls back), so it retries -// everything EXCEPT context.Canceled/DeadlineExceeded and the permanent -// SQLSTATE classes (22xxx data exceptions, 23xxx constraint violations, 42xxx -// syntax/schema/permission). See its doc for the rationale. +// RetryPostgresTx runs fn in a Postgres transaction, retrying the whole +// transaction on transient errors. fn gets a fresh *sql.Tx each attempt; if it +// returns an error the tx is rolled back, if nil the tx is committed. // -// Commit-phase errors (from (*sql.Tx).Commit) are retried only when -// isSafeRetryablePostgresError(err) OR opts.IsRetryable(err) is true — a -// narrower OPT-IN set (pre-send + server-rolled-back SQLSTATE; network errors -// excluded), because a commit-phase network error may mean the commit already -// succeeded (e.g., the connection was severed after the COMMIT was sent but -// before the response was received). Commit-phase errors that are NOT retried -// are returned to the caller wrapped with ErrCommitPhase so the caller can -// decide whether to alert, reconcile, or check via pg_xact_status() (future). +// Pre-commit errors use the opt-out isRetryablePostgresPreCommitError; commit +// errors use the narrower opt-in isSafeRetryablePostgresError (a commit-phase +// network error may mean the commit already succeeded). Non-retried commit +// errors are wrapped with ErrCommitPhase. opts.IsRetryable is additive on both. // -// db MUST be a *sql.DB backed by the pgx stdlib adapter (e.g. one returned -// by database.New with a PostgresConfig). Using a *sql.DB backed by another -// driver will not produce pgx-specific retries. +// db must be pgx-backed (e.g. from database.New with a PostgresConfig); other +// drivers won't get pgx-specific retries. func RetryPostgresTx(ctx context.Context, db *sql.DB, opts RetryTxOptions, fn func(*sql.Tx) error) error { return retryTx(ctx, db, postgresTxClassifier, opts, fn) } -// postgresTxClassifier is the per-phase retry classifier used by RetryPostgresTx. -// The preCommit classifier is an OPT-OUT classifier (broad: retry everything -// except permanent errors and context cancellation, because pre-commit the -// transaction rolls back so retrying is always safe); the commit classifier is -// the OPT-IN isSafeRetryablePostgresError (narrow: only pre-send + -// server-rolled-back, because a commit-phase network error may mean the commit -// already succeeded). See ErrCommitPhase. +// postgresTxClassifier pairs the pre-commit (opt-out) and commit (opt-in) +// classifiers. See ErrCommitPhase. var postgresTxClassifier = retryClassifier{ preCommit: isRetryablePostgresPreCommitError, commit: isSafeRetryablePostgresError, diff --git a/database/retry.go b/database/retry.go index 6eaaf85a..d33c1888 100644 --- a/database/retry.go +++ b/database/retry.go @@ -9,69 +9,50 @@ import ( "time" ) -// RetryTxOptions configures the transactional (safe) retry functions -// (RetryPostgresTx and the future RetryMySQLTx / RetrySpannerTx). +// RetryTxOptions configures RetryPostgresTx (and future RetryMySQLTx/SpannerTx). type RetryTxOptions struct { - // TxOptions is passed to (*sql.DB).BeginTx on each attempt. nil = default isolation. + // TxOptions passed to (*sql.DB).BeginTx each attempt; nil = default isolation. TxOptions *sql.TxOptions - // IsRetryable, if non-nil, is consulted IN ADDITION to the backend's default - // classifier (both the pre-commit and commit phases). Lets consumers add - // implementation-specific cases on top of the library's known-safe floor. - // Callers should be careful adding cases that are unsafe at commit time — a - // commit-phase error may mean the commit already succeeded (see ErrCommitPhase). + // IsRetryable, if non-nil, is OR'd with the backend's default classifier + // (both phases). Be careful adding cases unsafe at commit (see ErrCommitPhase). IsRetryable func(err error) bool } // RetryUnsafeOptions configures RetryUnsafe. type RetryUnsafeOptions struct { - // IsRetryable, if nil, defaults to "retry on any error except - // context.Canceled / context.DeadlineExceeded". If non-nil, replaces the default. + // IsRetryable, if nil, retries any error except context.Canceled/DeadlineExceeded. + // If non-nil, replaces the default. IsRetryable func(err error) bool } -// maxRetryAttempts is the maximum number of attempts for RetryPostgresTx and -// RetryUnsafe (the initial attempt plus 2 retries), matching database/sql's -// maxBadConnRetries+1 convention. It is intentionally not configurable: -// services that need a different retry budget should bound the total time via -// the context deadline (the retry loop exits early when ctx is done) or wrap -// the call in their own retry loop for more attempts. +// maxRetryAttempts is the attempt count for RetryPostgresTx and RetryUnsafe +// (initial + 2 retries), matching database/sql's maxBadConnRetries+1. Not +// configurable: use the context deadline to bound total time, or wrap for more. const maxRetryAttempts = 3 -// ErrCommitPhase wraps an error that occurred during the commit phase of a -// retried transaction (i.e., from (*sql.Tx).Commit) and was NOT retried. A -// commit-phase error is ambiguous: the commit may have succeeded on the server -// before the error was returned to the client (e.g., the TCP connection was -// severed after the COMMIT message was sent but before the response was -// received). Retrying such an operation could duplicate the committed work, so -// RetryPostgresTx deliberately does NOT retry commit-phase errors whose type -// could indicate a successful commit (e.g., network errors). -// -// The error is returned wrapped with ErrCommitPhase so the caller can detect -// it via errors.Is(err, database.ErrCommitPhase) and decide whether to alert, -// reconcile, or check whether the commit landed (e.g., via pg_xact_status(), -// which is future work). The underlying error is preserved for inspection via -// errors.Is/errors.As. -var ErrCommitPhase = errors.New("database: error occurred during the commit phase; the transaction may have committed before the error was returned") +// ErrCommitPhase wraps a non-retried error from (*sql.Tx).Commit. A commit- +// phase error is ambiguous: the commit may have succeeded before the error was +// returned (e.g. connection severed after COMMIT sent but before the response). +// RetryPostgresTx doesn't retry such errors (could duplicate the work) and +// wraps them with ErrCommitPhase so the caller can detect them via errors.Is +// and decide whether to alert/reconcile/check if the commit landed (future +// pg_xact_status() work). The underlying error is preserved. +var ErrCommitPhase = errors.New("database: commit-phase error; transaction may have committed before the error was returned") -// beginTxer is satisfied by *sql.DB so the retry loop is testable with fakes -// without needing a real database driver. +// beginTxer is satisfied by *sql.DB; the interface lets tests fake BeginTx. type beginTxer interface { BeginTx(ctx context.Context, opts *sql.TxOptions) (*sql.Tx, error) } -// retryClassifier holds the per-phase retryability classifiers used by retryTx. -// preCommit classifies errors from BeginTx or fn (before Commit is called); -// commit classifies errors from (*sql.Tx).Commit. The commit classifier should -// be NARROWER than preCommit because a commit-phase error may mean the commit -// already succeeded (see ErrCommitPhase). +// retryClassifier holds per-phase classifiers: preCommit for BeginTx/fn errors, +// commit for (*sql.Tx).Commit errors. commit must be narrower (a commit-phase +// error may mean the commit already succeeded — see ErrCommitPhase). type retryClassifier struct { preCommit func(error) bool commit func(error) bool } -// classify returns whether err is retryable, using the commit classifier when -// the error occurred during the commit phase and the preCommit classifier -// otherwise. +// classify picks the commit classifier when commitPhase, else preCommit. func (c retryClassifier) classify(err error, commitPhase bool) bool { if commitPhase { return c.commit(err) @@ -79,10 +60,10 @@ func (c retryClassifier) classify(err error, commitPhase bool) bool { return c.preCommit(err) } -// RetryUnsafe executes fn up to maxRetryAttempts times, retrying on any error -// that opts.IsRetryable classifies as retryable (default: any error except -// context cancellation/deadline). fn MUST be idempotent — no transaction -// wrapper is provided, so a retry may re-execute work that already committed. +// RetryUnsafe runs fn up to maxRetryAttempts times, retrying on any error +// opts.IsRetryable says is retryable (default: any except context cancellation/ +// deadline). fn MUST be idempotent — no transaction wrapper, so a retry may +// re-execute work that already committed. func RetryUnsafe(ctx context.Context, opts RetryUnsafeOptions, fn func() error) error { isRetryable := opts.IsRetryable if isRetryable == nil { @@ -104,8 +85,8 @@ func RetryUnsafe(ctx context.Context, opts RetryUnsafeOptions, fn func() error) return err } -// isRetryableUnsafeDefault retries on any error except context cancellation -// / deadline, since the caller of RetryUnsafe has vouched that fn is idempotent. +// isRetryableUnsafeDefault retries any error except context cancellation/deadline +// (RetryUnsafe callers vouch fn is idempotent). func isRetryableUnsafeDefault(err error) bool { if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { return false @@ -113,24 +94,14 @@ func isRetryableUnsafeDefault(err error) bool { return true } -// retryTx is the shared transactional retry loop used by RetryPostgresTx and -// the future RetryMySQLTx / RetrySpannerTx implementations. base is the -// backend's per-phase known-safe floor; opts.IsRetryable is additive on top of -// BOTH phases. Commit-phase errors that are not retried are wrapped with -// ErrCommitPhase before being returned to the caller, so the caller can detect -// that the error occurred during the commit phase (where the commit may have -// succeeded before the error was returned). +// retryTx is the shared retry loop for RetryPostgresTx (and future MySQL/Spanner +// implementations). opts.IsRetryable is OR'd with base on both phases. Non- +// retried commit-phase errors are wrapped with ErrCommitPhase. // -// database/sql already retries driver.ErrBadConn internally for *sql.DB methods -// (up to maxBadConnRetries=2, i.e. 3 total attempts) before surfacing it. That -// internal retry is immediate (no delay) and handles transient bad connections -// on fresh conns; this outer retry layers on top with full-jitter backoff to -// survive longer outages (e.g. an AlloyDB switchover). The composition is -// intentional: the internal retry covers the "connection was bad before I used -// it" case without delay, and this outer retry covers the "the database was -// briefly unavailable" case with jitter to spread fleet-wide retries. For -// *sql.Tx methods (tx.Exec, tx.Commit) there is no internal retry, so this -// outer retry is the only retry. +// database/sql already retries driver.ErrBadConn for *sql.DB methods (immediate, +// up to 3 attempts) before surfacing it; this outer loop layers on top with +// jitter for longer outages. *sql.Tx methods have no internal retry, so this is +// the only retry for tx.Exec/tx.Commit. func retryTx(ctx context.Context, db beginTxer, base retryClassifier, opts RetryTxOptions, fn func(*sql.Tx) error) error { classifier := base if opts.IsRetryable != nil { @@ -162,13 +133,9 @@ func retryTx(ctx context.Context, db beginTxer, base retryClassifier, opts Retry return lastErr } -// runOneTxAttempt runs a single begin/fn/commit attempt and returns the first -// error encountered (from BeginTx, fn, or Commit) along with a commitPhase -// flag indicating whether the error came from (*sql.Tx).Commit. On a -// successful commit it returns (nil, false). The deferred Rollback is a no-op -// after a successful Commit and cleans up the transaction if fn returns an -// error or panics; after a failed Commit the transaction is already in a -// terminal state so Rollback returns sql.ErrTxDone (ignored). +// runOneTxAttempt runs one begin/fn/commit attempt, returning the first error +// and whether it came from Commit (commitPhase). The deferred Rollback is a +// no-op after a successful Commit. func runOneTxAttempt(ctx context.Context, db beginTxer, txOpts *sql.TxOptions, fn func(*sql.Tx) error) (err error, commitPhase bool) { tx, beginErr := db.BeginTx(ctx, txOpts) if beginErr != nil { @@ -186,18 +153,17 @@ func runOneTxAttempt(ctx context.Context, db beginTxer, txOpts *sql.TxOptions, f if err = fn(tx); err != nil { return err, false } - // TODO(future): capture pg_current_xact_id() before commit and consult - // pg_xact_status() on commit error to detect whether the commit landed. - // Postgres-only; see RetryPostgresTx and ErrCommitPhase. + // TODO(future): capture pg_current_xact_id() pre-commit and consult + // pg_xact_status() on commit error to detect if the commit landed. if err = tx.Commit(); err != nil { return err, true } return nil, false } -// sleepWithJitter waits for a random duration in [0, retryJitterMax) before -// the next retry attempt, respecting ctx. It returns false if ctx was done -// before the sleep completed. It does not sleep after the final attempt. +// sleepWithJitter sleeps a random duration in [0, retryJitterMax) before the +// next attempt, respecting ctx. Returns false if ctx is done. No sleep after +// the final attempt. func sleepWithJitter(ctx context.Context, attempt, maxAttempts int) bool { if attempt >= maxAttempts-1 { return true diff --git a/database/retry_test.go b/database/retry_test.go index 353066bc..586a950b 100644 --- a/database/retry_test.go +++ b/database/retry_test.go @@ -350,29 +350,14 @@ func TestRetryPostgresTx(t *testing.T) { } func TestIsSafeRetryablePostgresError(t *testing.T) { - // isSafeRetryablePostgresError is the OPT-IN commit-phase classifier for - // RetryPostgresTx: pre-send guarantees + server-rolled-back SQLSTATE codes. - // Network errors are excluded (at commit, a network error could mean the - // commit succeeded but the response was lost — see ErrCommitPhase). The - // pre-commit classifier (isRetryablePostgresPreCommitError) is a separate - // OPT-OUT classifier that retries a broader set. - // - // driver.ErrBadConn is intentionally NOT in this classifier: pgx's - // wrapTx.Commit returns native pgx errors (not driver.ErrBadConn), so the - // commit path never sees it. The pre-send commit case is covered by - // pgconn.SafeToRetry below. driver.ErrBadConn from fn's tx.Exec IS - // retryable, but that's a pre-commit error handled by the opt-out catch-all - // (see TestIsRetryablePostgresPreCommitError). - - // Pre-send: pgconn.SafeToRetry-flagged errors. Safe at any phase because - // the operation never reached the server. - require.True(t, isSafeRetryablePostgresError(&safeToRetryErr{msg: "pre-send commit"})) - - // driver.ErrBadConn is NOT retryable here (the commit classifier doesn't - // check it — see the comment above). - require.False(t, isSafeRetryablePostgresError(driver.ErrBadConn)) - - // SQLSTATE codes that the server guarantees were rolled back. + // Commit-phase classifier (opt-in): pre-send + server-rolled-back only. + // driver.ErrBadConn excluded — see isSafeRetryablePostgresError doc. + + require.True(t, isSafeRetryablePostgresError(&safeToRetryErr{msg: "pre-send commit"})) // SafeToRetry-flagged + + require.False(t, isSafeRetryablePostgresError(driver.ErrBadConn)) // not checked at commit + + // Server-rolled-back SQLSTATE codes. require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "57P01"})) // admin_shutdown require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "57P02"})) // crash_shutdown require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "57P03"})) // cannot_connect_now @@ -381,13 +366,9 @@ func TestIsSafeRetryablePostgresError(t *testing.T) { require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "53300"})) // too_many_connections require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "57014"})) // query_canceled - // Permanent application errors are not retryable (would fail again). + // Permanent + network + context errors not retryable here. require.False(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "23505"})) // unique_violation require.False(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "42601"})) // syntax_error - - // Network errors are NOT retryable here: without a transaction there's no - // way to know if the query landed, and at commit time a network error - // could mean the commit succeeded but the response was lost (ErrCommitPhase). require.False(t, isSafeRetryablePostgresError(nil)) require.False(t, isSafeRetryablePostgresError(io.EOF)) require.False(t, isSafeRetryablePostgresError(io.ErrUnexpectedEOF)) @@ -397,17 +378,15 @@ func TestIsSafeRetryablePostgresError(t *testing.T) { } func TestIsRetryablePostgresPreCommitError(t *testing.T) { - // isRetryablePostgresPreCommitError is an OPT-OUT classifier: pre-commit - // retrying is always safe (the transaction rolls back), so it retries - // everything EXCEPT context.Canceled/DeadlineExceeded and the permanent - // SQLSTATE classes (22xxx, 23xxx, 42xxx). + // Pre-commit classifier (opt-out): retries everything except context + // cancellation and permanent SQLSTATE classes (22xxx/23xxx/42xxx). // Retried: pre-send guarantees. require.True(t, isRetryablePostgresPreCommitError(driver.ErrBadConn)) require.True(t, isRetryablePostgresPreCommitError(fmt.Errorf("wrapped: %w", driver.ErrBadConn))) require.True(t, isRetryablePostgresPreCommitError(&safeToRetryErr{msg: "pre-send"})) - // Retried: known-transient SQLSTATE codes (server rolled back). + // Retried: known-transient SQLSTATE (server rolled back). require.True(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "40001"})) // serialization_failure require.True(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "40P01"})) // deadlock_detected require.True(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "57P01"})) // admin_shutdown @@ -420,19 +399,13 @@ func TestIsRetryablePostgresPreCommitError(t *testing.T) { Err: errors.New("connection reset by peer"), })) - // Retried (opt-out): an UNKNOWN *pgconn.PgError code is retried because - // pre-commit it's safe and we'd rather not miss a transient error we - // haven't enumerated. XX000 (internal_error) is not in the permanent - // blocklist, so it's retried. - require.True(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "XX000"})) // internal_error (unknown/transient) - - // Retried (opt-out): a non-PgError, non-network error is retried. fn might - // return a wrapped error; pre-commit retrying is safe, and the cost of a - // false positive (200ms wasted on a permanent error) is less than the cost - // of a false negative (spurious user-facing failure). + // Retried: unknown PgError codes (not in the permanent blocklist) and + // arbitrary non-PgError errors — opt-out assumes retryable unless known + // permanent. + require.True(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "XX000"})) // internal_error (unknown) require.True(t, isRetryablePostgresPreCommitError(errors.New("some application error"))) - // NOT retried: permanent SQLSTATE classes (would fail again on retry). + // NOT retried: permanent SQLSTATE classes. require.False(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "23505"})) // 23xxx unique_violation require.False(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "23503"})) // 23xxx foreign_key_violation require.False(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "22001"})) // 22xxx string_data_right_truncation diff --git a/database/spanner.go b/database/spanner.go index f1986977..feec8f0f 100644 --- a/database/spanner.go +++ b/database/spanner.go @@ -46,14 +46,9 @@ func SpannerUniqueViolation(err error) bool { // RetrySpannerTx is the Spanner equivalent of RetryPostgresTx. // TODO: consider delegating to spannerdriver.RunTransactionWithOptions for -// ABORTED replay (it replays statements + buffered mutations on a new -// transaction and surfaces ErrAbortedDueToConcurrentModification when the -// replay sees different data), plus network-error retry on top. The shared -// retryTx helper in retry.go already handles driver.ErrBadConn -// (backend-agnostic) and the Begin/fn/Commit/Rollback loop; the missing -// piece is a Spanner-specific default classifier analogous to -// isRetryablePostgresTxError. See -// https://pkg.go.dev/github.com/googleapis/go-sql-spanner +// ABORTED replay, plus network retry. retryTx already handles the loop and +// driver.ErrBadConn; the missing piece is a Spanner-specific retryClassifier. +// See https://pkg.go.dev/github.com/googleapis/go-sql-spanner func RetrySpannerTx(ctx context.Context, db *sql.DB, opts RetryTxOptions, fn func(*sql.Tx) error) error { _ = ctx _ = db From d1172486a9989af4aa6c56083c57998ca45fb75e Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Tue, 30 Jun 2026 04:28:22 +0000 Subject: [PATCH 08/13] =?UTF-8?q?database:=20fix=20HealthCheckPeriod=20com?= =?UTF-8?q?ment=20=E2=80=94=20it=20doesn't=20ping?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HealthCheckPeriod drives the background goroutine that evicts connections exceeded MaxConnLifetime/MaxConnIdleTime; it does NOT ping for liveness. Liveness pinging is at acquire time (stdlib ResetSession, default ping if idle > 1s), with database/sql retrying on a fresh conn. The old comment claimed HealthCheckPeriod pings idle connections in the background and evicts dead connections before the app sees them — both wrong. Co-authored-by: E-Love (Eric Loveland) --- database/postgres.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/database/postgres.go b/database/postgres.go index a51a752b..fa7a13da 100644 --- a/database/postgres.go +++ b/database/postgres.go @@ -29,9 +29,11 @@ func postgresConnection(ctx context.Context, logger log.Logger, config PostgresC return nil, logger.LogErrorf("building pgx pool config: %w", err).Err() } - // HealthCheckPeriod makes pgxpool ping idle connections in the background. - // Dead connections (e.g. from an AlloyDB switchover) are evicted before - // the application ever sees them. + // HealthCheckPeriod is how often the background goroutine evicts connections + // that exceeded MaxConnLifetime or MaxConnIdleTime. It does NOT ping for + // liveness — dead connections are caught at acquire time by the ResetSession + // ping (default: ping if idle > 1s), with database/sql retrying on a fresh + // conn and RetryPostgresTx retrying beyond that. poolConfig.HealthCheckPeriod = 1 * time.Second pool, err := pgxpool.NewWithConfig(ctx, poolConfig) From dad59557723511f4d6466d3f666658a7197f7897 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Tue, 30 Jun 2026 04:35:43 +0000 Subject: [PATCH 09/13] database: exclude 57P02 (crash_shutdown) from the commit classifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit quickdie() (the SIGQUIT handler that raises ERRCODE_CRASH_SHUTDOWN) does _exit(2) WITHOUT rolling back — the source comment says 'we don't want to try to clean up our transaction'. The rollback is via crash recovery on restart, not before the client sees 57P02. Contrast with die() (57P01 admin_shutdown), which processes the interrupt through proc_exit/ AbortTransaction and does roll back before sending. So 57P02 at commit is ambiguous: the commit may have been recorded before SIGQUIT, and quickdie skips rollback. Removed 57P02 from isSafeRetryable- PostgresError (the commit classifier) so it's wrapped with ErrCommitPhase at commit (caller decides). 57P02 is still retried pre-commit via isRetryablePostgresPreCommitError's opt-out — the tx didn't commit (process exited before committing), so retry is safe there. Tests: TestIsSafeRetryablePostgresError asserts 57P02 -> false at commit; TestIsRetryablePostgresPreCommitError asserts 57P02 -> true pre-commit (with a comment noting it's safe despite no pre-return rollback). Co-authored-by: E-Love (Eric Loveland) --- database/postgres.go | 31 +++++++++++++++++++++---------- database/retry_test.go | 19 +++++++++++++------ 2 files changed, 34 insertions(+), 16 deletions(-) diff --git a/database/postgres.go b/database/postgres.go index fa7a13da..807a7250 100644 --- a/database/postgres.go +++ b/database/postgres.go @@ -177,13 +177,20 @@ func PostgresDeadlockFound(err error) bool { } // isSafeRetryablePostgresError is the commit-phase classifier for -// RetryPostgresTx: true iff the error guarantees no side effects (pre-send or -// server-rolled-back), so retry is safe even at commit. +// RetryPostgresTx: true iff the error guarantees the commit didn't happen, so +// retry is safe even at commit. // // - pgconn.SafeToRetry: pgx guarantees the error occurred before any data // was sent to the server. -// - SQLSTATE codes the server rolled back before returning: 40001, 40P01, -// 57P01, 57P02, 57P03, 53300, 57014. +// - SQLSTATE codes that guarantee non-commit: 40001, 40P01, 57014 (server +// rejected + rolled back), 57P01 (die() aborts the in-flight tx), 57P03, +// 53300 (connect-time, never started a tx). +// +// 57P02 (crash_shutdown) excluded: quickdie() does _exit(2) WITHOUT rolling +// back (rollback is via crash recovery on restart), and the commit may have +// been recorded before SIGQUIT — so 57P02 at commit is ambiguous. It's still +// retried pre-commit via isRetryablePostgresPreCommitError's opt-out (the tx +// didn't commit). See ErrCommitPhase. // // driver.ErrBadConn not checked: pgx's Commit returns native errors (the // ErrBadConn conversion is only in Exec/Query), so the commit path never sees @@ -201,16 +208,20 @@ func isSafeRetryablePostgresError(err error) bool { if pgconn.SafeToRetry(err) { return true } - // SQLSTATE codes the server rolled back (or never started a tx) before - // returning — safe to retry. See: - // https://www.postgresql.org/docs/current/server-shutdown.html (57P01/57P02/57P03) - // https://www.postgresql.org/docs/current/mvcc-serialization-failure-handling.html (40001/40P01) - // 53300 too_many_connections and 57014 query_canceled also rolled back. + // SQLSTATE codes that guarantee the commit didn't happen: + // 40001 serialization_failure, 40P01 deadlock_detected — server rejected + // the commit and rolled back (ErrorResponse proves non-commit). See + // https://www.postgresql.org/docs/current/mvcc-serialization-failure-handling.html + // 57P01 admin_shutdown — die() aborts the in-flight tx before sending. + // 57P03 cannot_connect_now, 53300 too_many_connections — connect-time + // rejections (won't surface from Commit, but harmless). + // 57014 query_canceled — server canceled and rolled back. + // 57P02 (crash_shutdown) excluded — see doc comment above. // 08xxx omitted: pgx surfaces those as network errors, not *pgconn.PgError. var pgErr *pgconn.PgError if errors.As(err, &pgErr) { switch pgErr.Code { - case "57P01", "57P02", "57P03", // admin/crash shutdown, cannot_connect_now + case "57P01", "57P03", // admin_shutdown, cannot_connect_now "40001", "40P01", // serialization_failure, deadlock_detected "53300", // too_many_connections "57014": // query_canceled diff --git a/database/retry_test.go b/database/retry_test.go index 586a950b..c72b5108 100644 --- a/database/retry_test.go +++ b/database/retry_test.go @@ -350,22 +350,25 @@ func TestRetryPostgresTx(t *testing.T) { } func TestIsSafeRetryablePostgresError(t *testing.T) { - // Commit-phase classifier (opt-in): pre-send + server-rolled-back only. - // driver.ErrBadConn excluded — see isSafeRetryablePostgresError doc. + // Commit-phase classifier (opt-in): only errors that guarantee the commit + // didn't happen. driver.ErrBadConn and 57P02 excluded — see doc. require.True(t, isSafeRetryablePostgresError(&safeToRetryErr{msg: "pre-send commit"})) // SafeToRetry-flagged require.False(t, isSafeRetryablePostgresError(driver.ErrBadConn)) // not checked at commit - // Server-rolled-back SQLSTATE codes. - require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "57P01"})) // admin_shutdown - require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "57P02"})) // crash_shutdown + // SQLSTATE codes that guarantee non-commit. + require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "57P01"})) // admin_shutdown (die() aborts tx) require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "57P03"})) // cannot_connect_now require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "40001"})) // serialization_failure require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "40P01"})) // deadlock_detected require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "53300"})) // too_many_connections require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "57014"})) // query_canceled + // 57P02 crash_shutdown excluded: quickdie() does _exit(2) without rollback, + // and the commit may have been recorded before SIGQUIT — ambiguous at commit. + require.False(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "57P02"})) + // Permanent + network + context errors not retryable here. require.False(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "23505"})) // unique_violation require.False(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "42601"})) // syntax_error @@ -386,10 +389,14 @@ func TestIsRetryablePostgresPreCommitError(t *testing.T) { require.True(t, isRetryablePostgresPreCommitError(fmt.Errorf("wrapped: %w", driver.ErrBadConn))) require.True(t, isRetryablePostgresPreCommitError(&safeToRetryErr{msg: "pre-send"})) - // Retried: known-transient SQLSTATE (server rolled back). + // Retried: known-transient SQLSTATE (server rolled back or tx didn't commit). require.True(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "40001"})) // serialization_failure require.True(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "40P01"})) // deadlock_detected require.True(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "57P01"})) // admin_shutdown + // 57P02 crash_shutdown is retried pre-commit: quickdie() skips rollback, but + // the tx didn't commit (process exited before committing) — safe pre-commit. + // (Excluded from the commit classifier — see TestIsSafeRetryablePostgresError.) + require.True(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "57P02"})) // crash_shutdown // Retried: typed network errors (non-PgError -> retry under opt-out). require.True(t, isRetryablePostgresPreCommitError(io.EOF)) From 1ea99cabea333fc05b585e533fc7606444435379 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Tue, 30 Jun 2026 04:51:33 +0000 Subject: [PATCH 10/13] database: maximally-conservative commit classifier + adopt pgerrcode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit classifier (isSafeRetryablePostgresError) now retries only errors that can NEVER mean 'possibly committed': - pgconn.SafeToRetry (pgx guarantees pre-send: COMMIT message never sent) - pgerrcode.IsTransactionRollback (class 40: server rolled back the tx before returning the ErrorResponse — 40001 serialization_failure, 40P01 deadlock_detected, 40002, 40003, 40000) Dropped 57P01 (admin_shutdown), 57P02 (crash_shutdown), 57014 (query_canceled), 57P03, 53300 from the commit classifier — each could mean 'possibly committed' (signal handlers can interrupt after the commit is recorded; quickdie skips rollback; cancel timing has an edge case; connect-time codes are irrelevant). They're still retried pre-commit via isRetryablePostgresPreCommitError's opt-out (the tx didn't commit pre-commit). At commit they're wrapped with ErrCommitPhase so the caller decides. Adopted github.com/jackc/pgerrcode (canonical Postgres error-code constants, by the pgx author) to eliminate string literals and the inline human-friendly- name comments: removed the local postgresErrUniqueViolation/postgresErrDeadlock- Found consts; PostgresUniqueViolation/PostgresDeadlockFound use pgerrcode; isPermanentPostgresError uses pgerrcode.IsDataException/IsIntegrityConstraint- Violation/IsSyntaxErrororAccessRuleViolation (class 22/23/42); the commit classifier uses pgerrcode.IsTransactionRollback (class 40). Tests: TestIsSafeRetryablePostgresError now asserts only SafeToRetry + class 40 retryable at commit (57P01/57P02/57P03/53300/57014 false); added a TestRetryPostgresTx subtest for 57P01-at-commit not retried + wrapped with ErrCommitPhase; TestIsRetryablePostgresPreCommitError covers 57P01/57P02/57P03/ 53300/57014 still retryable pre-commit; all string literals swapped for pgerrcode constants (postgres_test.go, database_test.go too). Co-authored-by: E-Love (Eric Loveland) --- database/database_test.go | 5 +- database/postgres.go | 82 ++++++++++---------------- database/postgres_test.go | 3 +- database/retry_test.go | 117 ++++++++++++++++++++++++-------------- go.mod | 1 + go.sum | 14 +---- 6 files changed, 111 insertions(+), 111 deletions(-) diff --git a/database/database_test.go b/database/database_test.go index abda3ddf..2b1cc151 100644 --- a/database/database_test.go +++ b/database/database_test.go @@ -12,6 +12,7 @@ import ( "time" gomysql "github.com/go-sql-driver/mysql" + "github.com/jackc/pgerrcode" "github.com/jackc/pgx/v5/pgconn" "github.com/moov-io/base/database" "github.com/moov-io/base/log" @@ -38,7 +39,7 @@ func TestUniqueViolation(t *testing.T) { t.Error("should have matched postgres unique violation") } pgconnErr := &pgconn.PgError{ - Code: "23505", + Code: pgerrcode.UniqueViolation, } if !database.UniqueViolation(pgconnErr) { t.Error("should have matched PgError unique violation") @@ -78,7 +79,7 @@ func TestDeadlockFound(t *testing.T) { t.Error("should have matched postgres deadlock found") } pgconnErr := &pgconn.PgError{ - Code: "40P01", + Code: pgerrcode.DeadlockDetected, } if !database.DeadlockFound(pgconnErr) { t.Error("should have matched PgError deadlock found") diff --git a/database/postgres.go b/database/postgres.go index 807a7250..de03af16 100644 --- a/database/postgres.go +++ b/database/postgres.go @@ -10,19 +10,13 @@ import ( "time" "cloud.google.com/go/alloydbconn" + "github.com/jackc/pgerrcode" "github.com/jackc/pgx/v5/pgconn" "github.com/jackc/pgx/v5/pgxpool" "github.com/jackc/pgx/v5/stdlib" "github.com/moov-io/base/log" ) -const ( - // PostgreSQL Error Codes - // https://www.postgresql.org/docs/current/errcodes-appendix.html - postgresErrUniqueViolation = "23505" - postgresErrDeadlockFound = "40P01" -) - func postgresConnection(ctx context.Context, logger log.Logger, config PostgresConfig, databaseName string) (*sql.DB, error) { poolConfig, err := buildPgxPoolConfig(ctx, config, databaseName) if err != nil { @@ -154,11 +148,11 @@ func PostgresUniqueViolation(err error) bool { } var pgError *pgconn.PgError - if errors.As(err, &pgError) && pgError.Code == postgresErrUniqueViolation { + if errors.As(err, &pgError) && pgError.Code == pgerrcode.UniqueViolation { return true } - return strings.Contains(err.Error(), postgresErrUniqueViolation) + return strings.Contains(err.Error(), pgerrcode.UniqueViolation) } // PostgresDeadlockFound returns true when the provided error matches the Postgres code @@ -169,36 +163,40 @@ func PostgresDeadlockFound(err error) bool { } var pgError *pgconn.PgError - if errors.As(err, &pgError) && pgError.Code == postgresErrDeadlockFound { + if errors.As(err, &pgError) && pgError.Code == pgerrcode.DeadlockDetected { return true } - return strings.Contains(err.Error(), postgresErrDeadlockFound) + return strings.Contains(err.Error(), pgerrcode.DeadlockDetected) } // isSafeRetryablePostgresError is the commit-phase classifier for // RetryPostgresTx: true iff the error guarantees the commit didn't happen, so -// retry is safe even at commit. +// retry is safe even at commit. Maximally conservative — only errors that can +// NEVER mean "possibly committed": // // - pgconn.SafeToRetry: pgx guarantees the error occurred before any data -// was sent to the server. -// - SQLSTATE codes that guarantee non-commit: 40001, 40P01, 57014 (server -// rejected + rolled back), 57P01 (die() aborts the in-flight tx), 57P03, -// 53300 (connect-time, never started a tx). -// -// 57P02 (crash_shutdown) excluded: quickdie() does _exit(2) WITHOUT rolling -// back (rollback is via crash recovery on restart), and the commit may have -// been recorded before SIGQUIT — so 57P02 at commit is ambiguous. It's still -// retried pre-commit via isRetryablePostgresPreCommitError's opt-out (the tx -// didn't commit). See ErrCommitPhase. +// was sent to the server (COMMIT message never sent). +// - pgerrcode.IsTransactionRollback: class 40 SQLSTATE codes (40001 +// serialization_failure, 40P01 deadlock_detected, 40002, 40003, 40000) — +// the server rolled back the transaction before returning the ErrorResponse. +// See https://www.postgresql.org/docs/current/mvcc-serialization-failure-handling.html // -// driver.ErrBadConn not checked: pgx's Commit returns native errors (the -// ErrBadConn conversion is only in Exec/Query), so the commit path never sees -// it; pre-send commit is covered by pgconn.SafeToRetry. *sql.Tx.Commit doesn't -// retry ErrBadConn internally (only *sql.DB methods do), so no double-retry. +// Excluded (could mean "possibly committed" → wrapped with ErrCommitPhase): +// - 57P01 admin_shutdown, 57P02 crash_shutdown: die()/quickdie() are signal +// handlers that can interrupt after the commit is recorded but before the +// response is sent. quickdie() also skips rollback (_exit(2)). +// - 57014 query_canceled: a cancel arriving after the commit records is a +// theoretical edge case. +// - 57P03/53300: connect-time, won't surface from Commit. +// - network errors: could mean commit succeeded but the response was lost. +// - driver.ErrBadConn: pgx's Commit returns native errors (the ErrBadConn +// conversion is only in Exec/Query), so the commit path never sees it; +// pre-send commit is covered by pgconn.SafeToRetry. *sql.Tx.Commit doesn't +// retry ErrBadConn internally (only *sql.DB methods do). // -// Network errors excluded: at commit, could mean commit succeeded but the -// response was lost (see ErrCommitPhase). +// Everything excluded here is still retried pre-commit via +// isRetryablePostgresPreCommitError's opt-out (the tx didn't commit pre-commit). // // Unexported: use RetryPostgresTx or RetryUnsafe rather than building on this. func isSafeRetryablePostgresError(err error) bool { @@ -208,25 +206,9 @@ func isSafeRetryablePostgresError(err error) bool { if pgconn.SafeToRetry(err) { return true } - // SQLSTATE codes that guarantee the commit didn't happen: - // 40001 serialization_failure, 40P01 deadlock_detected — server rejected - // the commit and rolled back (ErrorResponse proves non-commit). See - // https://www.postgresql.org/docs/current/mvcc-serialization-failure-handling.html - // 57P01 admin_shutdown — die() aborts the in-flight tx before sending. - // 57P03 cannot_connect_now, 53300 too_many_connections — connect-time - // rejections (won't surface from Commit, but harmless). - // 57014 query_canceled — server canceled and rolled back. - // 57P02 (crash_shutdown) excluded — see doc comment above. - // 08xxx omitted: pgx surfaces those as network errors, not *pgconn.PgError. var pgErr *pgconn.PgError if errors.As(err, &pgErr) { - switch pgErr.Code { - case "57P01", "57P03", // admin_shutdown, cannot_connect_now - "40001", "40P01", // serialization_failure, deadlock_detected - "53300", // too_many_connections - "57014": // query_canceled - return true - } + return pgerrcode.IsTransactionRollback(pgErr.Code) } return false } @@ -234,14 +216,10 @@ func isSafeRetryablePostgresError(err error) bool { // isPermanentPostgresError reports SQLSTATE classes that would fail again on // retry, used by isRetryablePostgresPreCommitError to opt out. Only // clearly-permanent classes are listed so unknown/transient codes are retried. -// -// - 22xxx data exceptions (e.g. 22001 string_data_right_truncation) -// - 23xxx integrity constraint violations (e.g. 23505 unique_violation) -// - 42xxx syntax/access-rule violations (42501, 42601, 42P01, 42703, 42804) func isPermanentPostgresError(pgErr *pgconn.PgError) bool { - return strings.HasPrefix(pgErr.Code, "22") || // data exception - strings.HasPrefix(pgErr.Code, "23") || // integrity constraint violation - strings.HasPrefix(pgErr.Code, "42") // syntax / access rule violation + return pgerrcode.IsDataException(pgErr.Code) || // class 22 + pgerrcode.IsIntegrityConstraintViolation(pgErr.Code) || // class 23 + pgerrcode.IsSyntaxErrororAccessRuleViolation(pgErr.Code) // class 42 } // retryJitterMax is the upper bound on the random delay between retries. Full diff --git a/database/postgres_test.go b/database/postgres_test.go index 8e659952..442ed0c0 100644 --- a/database/postgres_test.go +++ b/database/postgres_test.go @@ -8,6 +8,7 @@ import ( "testing" "time" + "github.com/jackc/pgerrcode" "github.com/jackc/pgx/v5/pgconn" "github.com/moov-io/base" "github.com/moov-io/base/database" @@ -201,7 +202,7 @@ func TestRetryUnsafe(t *testing.T) { err := database.RetryUnsafe(context.Background(), database.RetryUnsafeOptions{}, func() error { calls++ if calls < 3 { - return &pgconn.PgError{Code: "23505"} // unique_violation + return &pgconn.PgError{Code: pgerrcode.UniqueViolation} } return nil }) diff --git a/database/retry_test.go b/database/retry_test.go index c72b5108..6ab5db14 100644 --- a/database/retry_test.go +++ b/database/retry_test.go @@ -11,6 +11,7 @@ import ( "sync" "testing" + "github.com/jackc/pgerrcode" "github.com/jackc/pgx/v5/pgconn" "github.com/stretchr/testify/require" ) @@ -142,7 +143,7 @@ func TestRetryPostgresTx(t *testing.T) { err := RetryPostgresTx(context.Background(), db, RetryTxOptions{}, func(*sql.Tx) error { calls++ if calls < 2 { - return &pgconn.PgError{Code: "40001"} // serialization_failure -> retryable + return &pgconn.PgError{Code: pgerrcode.SerializationFailure} } return nil }) @@ -163,7 +164,7 @@ func TestRetryPostgresTx(t *testing.T) { calls := 0 err := RetryPostgresTx(context.Background(), db, RetryTxOptions{}, func(*sql.Tx) error { calls++ - return &pgconn.PgError{Code: "23505"} // unique_violation -> NOT retryable + return &pgconn.PgError{Code: pgerrcode.UniqueViolation} }) require.Error(t, err) require.Equal(t, 1, calls) @@ -256,15 +257,39 @@ func TestRetryPostgresTx(t *testing.T) { require.ErrorIs(t, err, io.EOF) // underlying error preserved }) + t.Run("commit-phase admin_shutdown (57P01) is NOT retried and is wrapped with ErrCommitPhase", func(t *testing.T) { + // 57P01 is excluded from the commit classifier: die() can interrupt after + // the commit is recorded, so 57P01 at commit could mean "possibly + // committed". Conservatively wrap with ErrCommitPhase (caller decides). + d := &mockDriver{ + commitErrs: []error{&pgconn.PgError{Code: pgerrcode.AdminShutdown}}, + } + db := newMockDB(d) + defer db.Close() + + calls := 0 + err := RetryPostgresTx(context.Background(), db, RetryTxOptions{}, func(*sql.Tx) error { + calls++ + return nil + }) + require.Error(t, err) + require.Equal(t, 1, calls) // not retried + require.Equal(t, 1, d.commitCalls) + require.ErrorIs(t, err, ErrCommitPhase) + var pgErr *pgconn.PgError + require.ErrorAs(t, err, &pgErr) + require.Equal(t, pgerrcode.AdminShutdown, pgErr.Code) + }) + t.Run("commit-phase exhaustion wraps the final error with ErrCommitPhase", func(t *testing.T) { - // A retryable commit error (40001 at commit) that never succeeds: the - // final error is a commit-phase error, so it is wrapped with - // ErrCommitPhase even though we exhausted attempts retrying it. + // A retryable commit error (serialization_failure at commit) that never + // succeeds: the final error is a commit-phase error, so it is wrapped + // with ErrCommitPhase even though we exhausted attempts retrying it. d := &mockDriver{ commitErrs: []error{ - &pgconn.PgError{Code: "40001"}, // serialization_failure at commit (retryable) - &pgconn.PgError{Code: "40001"}, - &pgconn.PgError{Code: "40001"}, + &pgconn.PgError{Code: pgerrcode.SerializationFailure}, + &pgconn.PgError{Code: pgerrcode.SerializationFailure}, + &pgconn.PgError{Code: pgerrcode.SerializationFailure}, }, } db := newMockDB(d) @@ -282,7 +307,7 @@ func TestRetryPostgresTx(t *testing.T) { // The underlying *pgconn.PgError is preserved for inspection via errors.As. var pgErr *pgconn.PgError require.ErrorAs(t, err, &pgErr) - require.Equal(t, "40001", pgErr.Code) + require.Equal(t, pgerrcode.SerializationFailure, pgErr.Code) }) t.Run("additive opts.IsRetryable extends the default classifier", func(t *testing.T) { @@ -322,7 +347,7 @@ func TestRetryPostgresTx(t *testing.T) { calls++ if calls == 1 { cancel() // cancel after the first attempt runs - return &pgconn.PgError{Code: "40001"} // retryable, but ctx is now done + return &pgconn.PgError{Code: pgerrcode.SerializationFailure} } return nil }) @@ -339,7 +364,7 @@ func TestRetryPostgresTx(t *testing.T) { calls := 0 err := RetryPostgresTx(context.Background(), db, RetryTxOptions{}, func(*sql.Tx) error { calls++ - return &pgconn.PgError{Code: "40P01"} // deadlock_detected -> retryable + return &pgconn.PgError{Code: pgerrcode.DeadlockDetected} }) require.Error(t, err) require.Equal(t, 3, calls) @@ -350,28 +375,30 @@ func TestRetryPostgresTx(t *testing.T) { } func TestIsSafeRetryablePostgresError(t *testing.T) { - // Commit-phase classifier (opt-in): only errors that guarantee the commit - // didn't happen. driver.ErrBadConn and 57P02 excluded — see doc. + // Commit-phase classifier (maximally conservative): only errors that can + // NEVER mean "possibly committed" — pgconn.SafeToRetry (pre-send) and + // class 40 (server rolled back before the ErrorResponse). Everything else + // is excluded → wrapped with ErrCommitPhase. See doc. require.True(t, isSafeRetryablePostgresError(&safeToRetryErr{msg: "pre-send commit"})) // SafeToRetry-flagged - require.False(t, isSafeRetryablePostgresError(driver.ErrBadConn)) // not checked at commit - - // SQLSTATE codes that guarantee non-commit. - require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "57P01"})) // admin_shutdown (die() aborts tx) - require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "57P03"})) // cannot_connect_now - require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "40001"})) // serialization_failure - require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "40P01"})) // deadlock_detected - require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "53300"})) // too_many_connections - require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "57014"})) // query_canceled + // Class 40 (Transaction Rollback) — server rolled back before returning. + require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: pgerrcode.SerializationFailure})) + require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: pgerrcode.DeadlockDetected})) + require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: pgerrcode.TransactionRollback})) + require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: pgerrcode.TransactionIntegrityConstraintViolation})) - // 57P02 crash_shutdown excluded: quickdie() does _exit(2) without rollback, - // and the commit may have been recorded before SIGQUIT — ambiguous at commit. - require.False(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "57P02"})) + // Excluded (could mean "possibly committed" → ErrCommitPhase at commit). + require.False(t, isSafeRetryablePostgresError(driver.ErrBadConn)) // not seen at commit (pgx Commit returns native errors) + require.False(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: pgerrcode.AdminShutdown})) // die() can interrupt after commit recorded + require.False(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: pgerrcode.CrashShutdown})) // quickdie() _exit(2) without rollback + require.False(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: pgerrcode.CannotConnectNow})) // connect-time, not a commit error + require.False(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: pgerrcode.TooManyConnections})) + require.False(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: pgerrcode.QueryCanceled})) // cancel-after-commit edge case // Permanent + network + context errors not retryable here. - require.False(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "23505"})) // unique_violation - require.False(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: "42601"})) // syntax_error + require.False(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: pgerrcode.UniqueViolation})) + require.False(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: pgerrcode.SyntaxError})) require.False(t, isSafeRetryablePostgresError(nil)) require.False(t, isSafeRetryablePostgresError(io.EOF)) require.False(t, isSafeRetryablePostgresError(io.ErrUnexpectedEOF)) @@ -389,14 +416,16 @@ func TestIsRetryablePostgresPreCommitError(t *testing.T) { require.True(t, isRetryablePostgresPreCommitError(fmt.Errorf("wrapped: %w", driver.ErrBadConn))) require.True(t, isRetryablePostgresPreCommitError(&safeToRetryErr{msg: "pre-send"})) - // Retried: known-transient SQLSTATE (server rolled back or tx didn't commit). - require.True(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "40001"})) // serialization_failure - require.True(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "40P01"})) // deadlock_detected - require.True(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "57P01"})) // admin_shutdown - // 57P02 crash_shutdown is retried pre-commit: quickdie() skips rollback, but - // the tx didn't commit (process exited before committing) — safe pre-commit. - // (Excluded from the commit classifier — see TestIsSafeRetryablePostgresError.) - require.True(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "57P02"})) // crash_shutdown + // Retried: class 40 (server rolled back) and shutdown/cancel codes — the tx + // didn't commit pre-commit, so retry is safe even though some of these + // (57P01/57P02/57014) are excluded from the commit classifier. + require.True(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: pgerrcode.SerializationFailure})) + require.True(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: pgerrcode.DeadlockDetected})) + require.True(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: pgerrcode.AdminShutdown})) + require.True(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: pgerrcode.CrashShutdown})) + require.True(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: pgerrcode.CannotConnectNow})) + require.True(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: pgerrcode.TooManyConnections})) + require.True(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: pgerrcode.QueryCanceled})) // Retried: typed network errors (non-PgError -> retry under opt-out). require.True(t, isRetryablePostgresPreCommitError(io.EOF)) @@ -409,17 +438,17 @@ func TestIsRetryablePostgresPreCommitError(t *testing.T) { // Retried: unknown PgError codes (not in the permanent blocklist) and // arbitrary non-PgError errors — opt-out assumes retryable unless known // permanent. - require.True(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "XX000"})) // internal_error (unknown) + require.True(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: pgerrcode.InternalError})) require.True(t, isRetryablePostgresPreCommitError(errors.New("some application error"))) - // NOT retried: permanent SQLSTATE classes. - require.False(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "23505"})) // 23xxx unique_violation - require.False(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "23503"})) // 23xxx foreign_key_violation - require.False(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "22001"})) // 22xxx string_data_right_truncation - require.False(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "42601"})) // 42xxx syntax_error - require.False(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "42501"})) // 42xxx insufficient_privilege - require.False(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "42P01"})) // 42xxx undefined_table - require.False(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: "42703"})) // 42xxx undefined_column + // NOT retried: permanent SQLSTATE classes (22xxx/23xxx/42xxx). + require.False(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: pgerrcode.UniqueViolation})) + require.False(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: pgerrcode.ForeignKeyViolation})) + require.False(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: pgerrcode.StringDataRightTruncationDataException})) + require.False(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: pgerrcode.SyntaxError})) + require.False(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: pgerrcode.InsufficientPrivilege})) + require.False(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: pgerrcode.UndefinedTable})) + require.False(t, isRetryablePostgresPreCommitError(&pgconn.PgError{Code: pgerrcode.UndefinedColumn})) // NOT retried: caller's context is done. require.False(t, isRetryablePostgresPreCommitError(context.Canceled)) diff --git a/go.mod b/go.mod index 9ead87e7..b209ab14 100644 --- a/go.mod +++ b/go.mod @@ -14,6 +14,7 @@ require ( github.com/googleapis/gax-go/v2 v2.22.0 github.com/googleapis/go-sql-spanner v1.25.1 github.com/gorilla/mux v1.8.1 + github.com/jackc/pgerrcode v0.0.0-20220416144525-469b46aa5efa github.com/jackc/pgx/v5 v5.9.2 github.com/madflojo/testcerts v1.5.0 github.com/markbates/pkger v0.17.1 diff --git a/go.sum b/go.sum index a0194c29..1e6377a6 100644 --- a/go.sum +++ b/go.sum @@ -5,8 +5,6 @@ cloud.google.com/go v0.123.0 h1:2NAUJwPR47q+E35uaJeYoNhuNEM9kM8SjgRgdeOJUSE= cloud.google.com/go v0.123.0/go.mod h1:xBoMV08QcqUGuPW65Qfm1o9Y4zKZBpGS+7bImXLTAZU= cloud.google.com/go/alloydb v1.26.0 h1:UTzyumJ8tEo0CqwzLkV4WMGnCxvvhw3BDy1nXfCt9KE= cloud.google.com/go/alloydb v1.26.0/go.mod h1:oqHGc/Xb5fWtH+wIDpu2wcPJX9oML/fGJuH/sp8ysyo= -cloud.google.com/go/alloydbconn v1.18.2 h1:zxxUnSU50d1sS/nswGBldpZsaxF3emirbO+xF+Rtgig= -cloud.google.com/go/alloydbconn v1.18.2/go.mod h1:0vfrUSwleLoK13ycnoaZ1A4yCfg3r7rig7Xm5vKlEtM= cloud.google.com/go/alloydbconn v1.18.3 h1:7A8QN5DtPkyGToqekWSm4+Ryrx2+xYWkvRYh0A0fAVg= cloud.google.com/go/alloydbconn v1.18.3/go.mod h1:cbwUQHP9eLp3Y66xZyQZrKmlcvqLnlb91qKc6kXV46Q= cloud.google.com/go/auth v0.20.0 h1:kXTssoVb4azsVDoUiF8KvxAqrsQcQtB53DcSgta74CA= @@ -160,6 +158,8 @@ github.com/grpc-ecosystem/grpc-gateway/v2 v2.29.0 h1:5VipnvEpbqr2gA2VbM+nYVbkIF2 github.com/grpc-ecosystem/grpc-gateway/v2 v2.29.0/go.mod h1:Hyl3n6Twe1hvtd9XUXDec4pTvgMSEixRuQKPTMH2bNs= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= +github.com/jackc/pgerrcode v0.0.0-20220416144525-469b46aa5efa h1:s+4MhCQ6YrzisK6hFJUX53drDT4UsSW3DEhKn0ifuHw= +github.com/jackc/pgerrcode v0.0.0-20220416144525-469b46aa5efa/go.mod h1:a/s9Lp5W7n/DD0VrVoyJ00FbP2ytTPDVOivvn2bMlds= github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM= github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg= github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo= @@ -280,8 +280,6 @@ go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.50.0 h1:zO47/JPrL6vsNkINmLoo/PH1gcxpls50DNogFvB5ZGI= -golang.org/x/crypto v0.50.0/go.mod h1:3muZ7vA7PBCE6xgPX7nkzzjiUq87kRItoJQM1Yo8S+Q= golang.org/x/crypto v0.51.0 h1:IBPXwPfKxY7cWQZ38ZCIRPI50YLeevDLlLnyC5wRGTI= golang.org/x/crypto v0.51.0/go.mod h1:8AdwkbraGNABw2kOX6YFPs3WM22XqI4EXEd8g+x7Oc8= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= @@ -294,8 +292,6 @@ golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73r golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.53.0 h1:d+qAbo5L0orcWAr0a9JweQpjXF19LMXJE8Ey7hwOdUA= -golang.org/x/net v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs= golang.org/x/net v0.54.0 h1:2zJIZAxAHV/OHCDTCOHAYehQzLfSXuf/5SoL/Dv6w/w= golang.org/x/net v0.54.0/go.mod h1:Sj4oj8jK6XmHpBZU/zWHw3BV3abl4Kvi+Ut7cQcY+cQ= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= @@ -314,8 +310,6 @@ golang.org/x/sys v0.44.0 h1:ildZl3J4uzeKP07r2F++Op7E9B29JRUy+a27EibtBTQ= golang.org/x/sys v0.44.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= -golang.org/x/text v0.36.0 h1:JfKh3XmcRPqZPKevfXVpI1wXPTqbkE5f7JA92a55Yxg= -golang.org/x/text v0.36.0/go.mod h1:NIdBknypM8iqVmPiuco0Dh6P5Jcdk8lJL0CUebqK164= golang.org/x/text v0.37.0 h1:Cqjiwd9eSg8e0QAkyCaQTNHFIIzWtidPahFWR83rTrc= golang.org/x/text v0.37.0/go.mod h1:a5sjxXGs9hsn/AJVwuElvCAo9v8QYLzvavO5z2PiM38= golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U= @@ -339,8 +333,6 @@ google.golang.org/genproto v0.0.0-20260504160031-60b97b32f348 h1:JjVGDZYWkJWZcxv google.golang.org/genproto v0.0.0-20260504160031-60b97b32f348/go.mod h1:95PqD4xM+AdOcBGsmgfaofXsiA37uXDtDufVbntT3TU= google.golang.org/genproto/googleapis/api v0.0.0-20260504160031-60b97b32f348 h1:U8orV30l6KpDsi9dxU0CoJZGbjS8EEpw+6ba+XwGPQA= google.golang.org/genproto/googleapis/api v0.0.0-20260504160031-60b97b32f348/go.mod h1:Yzdzr5OOZFgSsEV2D/Xi9NL3bszpXFAg0hFJiRohcD8= -google.golang.org/genproto/googleapis/rpc v0.0.0-20260504160031-60b97b32f348 h1:pfIbyB44sWzHiCpRqIen67ZQnVXSfIxWrqUMk1qwODE= -google.golang.org/genproto/googleapis/rpc v0.0.0-20260504160031-60b97b32f348/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8= google.golang.org/genproto/googleapis/rpc v0.0.0-20260511170946-3700d4141b60 h1:seT2EwLWM78plQ7wcDfuWBc/4FAEAXDDiaSol4ku4qo= google.golang.org/genproto/googleapis/rpc v0.0.0-20260511170946-3700d4141b60/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= @@ -348,8 +340,6 @@ google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyac google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= google.golang.org/grpc v1.33.2/go.mod h1:JMHMWHQWaTccqQQlmk3MJZS+GWXOdAesneDmEnv2fbc= -google.golang.org/grpc v1.81.0 h1:W3G9N3KQf3BU+YuCtGKJk0CmxQNbAISICD/9AORxLIw= -google.golang.org/grpc v1.81.0/go.mod h1:xGH9GfzOyMTGIOXBJmXt+BX/V0kcdQbdcuwQ/zNw42I= google.golang.org/grpc v1.81.1 h1:VnnIIZ88UzOOKLukQi+ImGz8O1Wdp8nAGGnvOfEIWQQ= google.golang.org/grpc v1.81.1/go.mod h1:xGH9GfzOyMTGIOXBJmXt+BX/V0kcdQbdcuwQ/zNw42I= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= From 6dd632d0bcb38ba029026b66b29112b45ed22bce Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Tue, 30 Jun 2026 12:07:55 +0000 Subject: [PATCH 11/13] database: rename RetryUnsafe->RetryIdempotent, RetryPostgresTx->RetryPostgresNonIdempotent Renames so the common case (selects, idempotent DML) doesn't put 'unsafe' all over callers' codebases, and the transactional variant's name signals it's for non-idempotent ops: RetryUnsafe -> RetryIdempotent RetryUnsafeOptions -> RetryIdempotentOptions RetryPostgresTx -> RetryPostgresNonIdempotent RetryMySQLTx -> RetryMySQLNonIdempotent (placeholder) RetrySpannerTx -> RetrySpannerNonIdempotent (placeholder) RetryTxOptions -> RetryNonIdempotentOptions Dropped the redundant 'Tx' suffix from the public names: the fn func(*sql.Tx) parameter makes the transaction obvious at every call site, so 'Tx' in the name added little. The internal retryTx helper keeps its name (unexported, descriptive). No behavior change. Co-authored-by: E-Love (Eric Loveland) --- database/mysql.go | 6 +++--- database/postgres.go | 12 ++++++------ database/postgres_test.go | 16 ++++++++-------- database/retry.go | 23 ++++++++++++----------- database/retry_test.go | 36 ++++++++++++++++++------------------ database/spanner.go | 6 +++--- 6 files changed, 50 insertions(+), 49 deletions(-) diff --git a/database/mysql.go b/database/mysql.go index c0d75cf9..07d0d8b4 100644 --- a/database/mysql.go +++ b/database/mysql.go @@ -246,15 +246,15 @@ func MySQLDeadlockFound(err error) bool { return strings.Contains(err.Error(), fmt.Sprintf("Error %d", mysqlErrDeadlockFound)) } -// RetryMySQLTx is the MySQL equivalent of RetryPostgresTx. +// RetryMySQLNonIdempotent is the MySQL equivalent of RetryPostgresNonIdempotent. // TODO: implement a MySQL-specific retryClassifier (retryTx already handles the // Begin/fn/Commit/Rollback loop and driver.ErrBadConn). Cover 1213 deadlock, // 1205 lock wait timeout, 2013 connection lost. See // https://dev.mysql.com/doc/mysql-errors/8.0/en/server-error-reference.html -func RetryMySQLTx(ctx context.Context, db *sql.DB, opts RetryTxOptions, fn func(*sql.Tx) error) error { +func RetryMySQLNonIdempotent(ctx context.Context, db *sql.DB, opts RetryNonIdempotentOptions, fn func(*sql.Tx) error) error { _ = ctx _ = db _ = opts _ = fn - return errors.New("database: RetryMySQLTx is not yet implemented; see TODO") + return errors.New("database: RetryMySQLNonIdempotent is not yet implemented; see TODO") } diff --git a/database/postgres.go b/database/postgres.go index de03af16..641b1893 100644 --- a/database/postgres.go +++ b/database/postgres.go @@ -27,7 +27,7 @@ func postgresConnection(ctx context.Context, logger log.Logger, config PostgresC // that exceeded MaxConnLifetime or MaxConnIdleTime. It does NOT ping for // liveness — dead connections are caught at acquire time by the ResetSession // ping (default: ping if idle > 1s), with database/sql retrying on a fresh - // conn and RetryPostgresTx retrying beyond that. + // conn and RetryPostgresNonIdempotent retrying beyond that. poolConfig.HealthCheckPeriod = 1 * time.Second pool, err := pgxpool.NewWithConfig(ctx, poolConfig) @@ -171,7 +171,7 @@ func PostgresDeadlockFound(err error) bool { } // isSafeRetryablePostgresError is the commit-phase classifier for -// RetryPostgresTx: true iff the error guarantees the commit didn't happen, so +// RetryPostgresNonIdempotent: true iff the error guarantees the commit didn't happen, so // retry is safe even at commit. Maximally conservative — only errors that can // NEVER mean "possibly committed": // @@ -198,7 +198,7 @@ func PostgresDeadlockFound(err error) bool { // Everything excluded here is still retried pre-commit via // isRetryablePostgresPreCommitError's opt-out (the tx didn't commit pre-commit). // -// Unexported: use RetryPostgresTx or RetryUnsafe rather than building on this. +// Unexported: use RetryPostgresNonIdempotent or RetryIdempotent rather than building on this. func isSafeRetryablePostgresError(err error) bool { if err == nil { return false @@ -231,7 +231,7 @@ func isPermanentPostgresError(pgErr *pgconn.PgError) bool { const retryJitterMax = 100 * time.Millisecond // isRetryablePostgresPreCommitError is the pre-commit classifier for -// RetryPostgresTx (errors from BeginTx or fn). OPTS OUT: pre-commit retry is +// RetryPostgresNonIdempotent (errors from BeginTx or fn). OPTS OUT: pre-commit retry is // always safe (the transaction rolls back), so it retries everything EXCEPT // context.Canceled/DeadlineExceeded and the permanent SQLSTATE classes // (isPermanentPostgresError). Unknown *pgconn.PgError codes and non-PgError @@ -255,7 +255,7 @@ func isRetryablePostgresPreCommitError(err error) bool { return true } -// RetryPostgresTx runs fn in a Postgres transaction, retrying the whole +// RetryPostgresNonIdempotent runs fn in a Postgres transaction, retrying the whole // transaction on transient errors. fn gets a fresh *sql.Tx each attempt; if it // returns an error the tx is rolled back, if nil the tx is committed. // @@ -266,7 +266,7 @@ func isRetryablePostgresPreCommitError(err error) bool { // // db must be pgx-backed (e.g. from database.New with a PostgresConfig); other // drivers won't get pgx-specific retries. -func RetryPostgresTx(ctx context.Context, db *sql.DB, opts RetryTxOptions, fn func(*sql.Tx) error) error { +func RetryPostgresNonIdempotent(ctx context.Context, db *sql.DB, opts RetryNonIdempotentOptions, fn func(*sql.Tx) error) error { return retryTx(ctx, db, postgresTxClassifier, opts, fn) } diff --git a/database/postgres_test.go b/database/postgres_test.go index 442ed0c0..6023c369 100644 --- a/database/postgres_test.go +++ b/database/postgres_test.go @@ -183,10 +183,10 @@ func Test_Postgres_Alloy_Migrations(t *testing.T) { defer db.Close() } -func TestRetryUnsafe(t *testing.T) { +func TestRetryIdempotent(t *testing.T) { t.Run("succeeds on first attempt", func(t *testing.T) { calls := 0 - err := database.RetryUnsafe(context.Background(), database.RetryUnsafeOptions{}, func() error { + err := database.RetryIdempotent(context.Background(), database.RetryIdempotentOptions{}, func() error { calls++ return nil }) @@ -196,10 +196,10 @@ func TestRetryUnsafe(t *testing.T) { t.Run("retries on any error by default, including non-Pg-retryable ones", func(t *testing.T) { // unique_violation is NOT retryable per IsRetryablePostgresError, but - // RetryUnsafe's default predicate retries any error except context + // RetryIdempotent's default predicate retries any error except context // cancellation because the caller has vouched that fn is idempotent. calls := 0 - err := database.RetryUnsafe(context.Background(), database.RetryUnsafeOptions{}, func() error { + err := database.RetryIdempotent(context.Background(), database.RetryIdempotentOptions{}, func() error { calls++ if calls < 3 { return &pgconn.PgError{Code: pgerrcode.UniqueViolation} @@ -213,7 +213,7 @@ func TestRetryUnsafe(t *testing.T) { t.Run("custom IsRetryable short-circuits non-retryable errors", func(t *testing.T) { calls := 0 neverRetry := func(error) bool { return false } - err := database.RetryUnsafe(context.Background(), database.RetryUnsafeOptions{IsRetryable: neverRetry}, func() error { + err := database.RetryIdempotent(context.Background(), database.RetryIdempotentOptions{IsRetryable: neverRetry}, func() error { calls++ return io.EOF }) @@ -224,7 +224,7 @@ func TestRetryUnsafe(t *testing.T) { t.Run("does not retry context cancellation by default", func(t *testing.T) { calls := 0 - err := database.RetryUnsafe(context.Background(), database.RetryUnsafeOptions{}, func() error { + err := database.RetryIdempotent(context.Background(), database.RetryIdempotentOptions{}, func() error { calls++ return context.Canceled }) @@ -238,7 +238,7 @@ func TestRetryUnsafe(t *testing.T) { cancel() // cancel immediately calls := 0 - err := database.RetryUnsafe(ctx, database.RetryUnsafeOptions{}, func() error { + err := database.RetryIdempotent(ctx, database.RetryIdempotentOptions{}, func() error { calls++ return io.EOF // retryable, but context is done }) @@ -249,7 +249,7 @@ func TestRetryUnsafe(t *testing.T) { t.Run("exhausts all attempts", func(t *testing.T) { calls := 0 - err := database.RetryUnsafe(context.Background(), database.RetryUnsafeOptions{}, func() error { + err := database.RetryIdempotent(context.Background(), database.RetryIdempotentOptions{}, func() error { calls++ return io.EOF }) diff --git a/database/retry.go b/database/retry.go index d33c1888..fbb07eb3 100644 --- a/database/retry.go +++ b/database/retry.go @@ -9,8 +9,9 @@ import ( "time" ) -// RetryTxOptions configures RetryPostgresTx (and future RetryMySQLTx/SpannerTx). -type RetryTxOptions struct { +// RetryNonIdempotentOptions configures RetryPostgresNonIdempotent (and future +// RetryMySQLNonIdempotent / RetrySpannerNonIdempotent). +type RetryNonIdempotentOptions struct { // TxOptions passed to (*sql.DB).BeginTx each attempt; nil = default isolation. TxOptions *sql.TxOptions // IsRetryable, if non-nil, is OR'd with the backend's default classifier @@ -18,14 +19,14 @@ type RetryTxOptions struct { IsRetryable func(err error) bool } -// RetryUnsafeOptions configures RetryUnsafe. -type RetryUnsafeOptions struct { +// RetryIdempotentOptions configures RetryIdempotent. +type RetryIdempotentOptions struct { // IsRetryable, if nil, retries any error except context.Canceled/DeadlineExceeded. // If non-nil, replaces the default. IsRetryable func(err error) bool } -// maxRetryAttempts is the attempt count for RetryPostgresTx and RetryUnsafe +// maxRetryAttempts is the attempt count for RetryPostgresNonIdempotent and RetryIdempotent // (initial + 2 retries), matching database/sql's maxBadConnRetries+1. Not // configurable: use the context deadline to bound total time, or wrap for more. const maxRetryAttempts = 3 @@ -33,7 +34,7 @@ const maxRetryAttempts = 3 // ErrCommitPhase wraps a non-retried error from (*sql.Tx).Commit. A commit- // phase error is ambiguous: the commit may have succeeded before the error was // returned (e.g. connection severed after COMMIT sent but before the response). -// RetryPostgresTx doesn't retry such errors (could duplicate the work) and +// RetryPostgresNonIdempotent doesn't retry such errors (could duplicate the work) and // wraps them with ErrCommitPhase so the caller can detect them via errors.Is // and decide whether to alert/reconcile/check if the commit landed (future // pg_xact_status() work). The underlying error is preserved. @@ -60,11 +61,11 @@ func (c retryClassifier) classify(err error, commitPhase bool) bool { return c.preCommit(err) } -// RetryUnsafe runs fn up to maxRetryAttempts times, retrying on any error +// RetryIdempotent runs fn up to maxRetryAttempts times, retrying on any error // opts.IsRetryable says is retryable (default: any except context cancellation/ // deadline). fn MUST be idempotent — no transaction wrapper, so a retry may // re-execute work that already committed. -func RetryUnsafe(ctx context.Context, opts RetryUnsafeOptions, fn func() error) error { +func RetryIdempotent(ctx context.Context, opts RetryIdempotentOptions, fn func() error) error { isRetryable := opts.IsRetryable if isRetryable == nil { isRetryable = isRetryableUnsafeDefault @@ -86,7 +87,7 @@ func RetryUnsafe(ctx context.Context, opts RetryUnsafeOptions, fn func() error) } // isRetryableUnsafeDefault retries any error except context cancellation/deadline -// (RetryUnsafe callers vouch fn is idempotent). +// (RetryIdempotent callers vouch fn is idempotent). func isRetryableUnsafeDefault(err error) bool { if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { return false @@ -94,7 +95,7 @@ func isRetryableUnsafeDefault(err error) bool { return true } -// retryTx is the shared retry loop for RetryPostgresTx (and future MySQL/Spanner +// retryTx is the shared retry loop for RetryPostgresNonIdempotent (and future MySQL/Spanner // implementations). opts.IsRetryable is OR'd with base on both phases. Non- // retried commit-phase errors are wrapped with ErrCommitPhase. // @@ -102,7 +103,7 @@ func isRetryableUnsafeDefault(err error) bool { // up to 3 attempts) before surfacing it; this outer loop layers on top with // jitter for longer outages. *sql.Tx methods have no internal retry, so this is // the only retry for tx.Exec/tx.Commit. -func retryTx(ctx context.Context, db beginTxer, base retryClassifier, opts RetryTxOptions, fn func(*sql.Tx) error) error { +func retryTx(ctx context.Context, db beginTxer, base retryClassifier, opts RetryNonIdempotentOptions, fn func(*sql.Tx) error) error { classifier := base if opts.IsRetryable != nil { extra := opts.IsRetryable diff --git a/database/retry_test.go b/database/retry_test.go index 6ab5db14..0545090d 100644 --- a/database/retry_test.go +++ b/database/retry_test.go @@ -27,7 +27,7 @@ func (e *safeToRetryErr) SafeToRetry() bool { return true } // mockConnector + mockDriver + mockConn + mockTx form a minimal // database/sql/driver implementation whose BeginTx/Commit/Rollback/Exec return -// configurable, stateful error sequences. This lets RetryPostgresTx (and the +// configurable, stateful error sequences. This lets RetryPostgresNonIdempotent (and the // shared retryTx helper it delegates to) be driven through a real *sql.DB // without a Postgres. *sql.Tx is concrete, so a mock driver is the only way // to control Commit/Rollback errors. @@ -117,14 +117,14 @@ func newMockDB(d *mockDriver) *sql.DB { return sql.OpenDB(&mockConnector{d: d}) } -func TestRetryPostgresTx(t *testing.T) { +func TestRetryPostgresNonIdempotent(t *testing.T) { t.Run("succeeds on first attempt", func(t *testing.T) { d := &mockDriver{} db := newMockDB(d) defer db.Close() calls := 0 - err := RetryPostgresTx(context.Background(), db, RetryTxOptions{}, func(*sql.Tx) error { + err := RetryPostgresNonIdempotent(context.Background(), db, RetryNonIdempotentOptions{}, func(*sql.Tx) error { calls++ return nil }) @@ -140,7 +140,7 @@ func TestRetryPostgresTx(t *testing.T) { defer db.Close() calls := 0 - err := RetryPostgresTx(context.Background(), db, RetryTxOptions{}, func(*sql.Tx) error { + err := RetryPostgresNonIdempotent(context.Background(), db, RetryNonIdempotentOptions{}, func(*sql.Tx) error { calls++ if calls < 2 { return &pgconn.PgError{Code: pgerrcode.SerializationFailure} @@ -162,7 +162,7 @@ func TestRetryPostgresTx(t *testing.T) { defer db.Close() calls := 0 - err := RetryPostgresTx(context.Background(), db, RetryTxOptions{}, func(*sql.Tx) error { + err := RetryPostgresNonIdempotent(context.Background(), db, RetryNonIdempotentOptions{}, func(*sql.Tx) error { calls++ return &pgconn.PgError{Code: pgerrcode.UniqueViolation} }) @@ -182,7 +182,7 @@ func TestRetryPostgresTx(t *testing.T) { defer db.Close() calls := 0 - err := RetryPostgresTx(context.Background(), db, RetryTxOptions{}, func(*sql.Tx) error { + err := RetryPostgresNonIdempotent(context.Background(), db, RetryNonIdempotentOptions{}, func(*sql.Tx) error { calls++ if calls < 2 { return driver.ErrBadConn @@ -205,7 +205,7 @@ func TestRetryPostgresTx(t *testing.T) { defer db.Close() calls := 0 - err := RetryPostgresTx(context.Background(), db, RetryTxOptions{}, func(*sql.Tx) error { + err := RetryPostgresNonIdempotent(context.Background(), db, RetryNonIdempotentOptions{}, func(*sql.Tx) error { calls++ return nil }) @@ -222,7 +222,7 @@ func TestRetryPostgresTx(t *testing.T) { defer db.Close() calls := 0 - err := RetryPostgresTx(context.Background(), db, RetryTxOptions{}, func(*sql.Tx) error { + err := RetryPostgresNonIdempotent(context.Background(), db, RetryNonIdempotentOptions{}, func(*sql.Tx) error { calls++ return nil }) @@ -244,7 +244,7 @@ func TestRetryPostgresTx(t *testing.T) { defer db.Close() calls := 0 - err := RetryPostgresTx(context.Background(), db, RetryTxOptions{}, func(*sql.Tx) error { + err := RetryPostgresNonIdempotent(context.Background(), db, RetryNonIdempotentOptions{}, func(*sql.Tx) error { calls++ return nil }) @@ -268,7 +268,7 @@ func TestRetryPostgresTx(t *testing.T) { defer db.Close() calls := 0 - err := RetryPostgresTx(context.Background(), db, RetryTxOptions{}, func(*sql.Tx) error { + err := RetryPostgresNonIdempotent(context.Background(), db, RetryNonIdempotentOptions{}, func(*sql.Tx) error { calls++ return nil }) @@ -296,7 +296,7 @@ func TestRetryPostgresTx(t *testing.T) { defer db.Close() calls := 0 - err := RetryPostgresTx(context.Background(), db, RetryTxOptions{}, func(*sql.Tx) error { + err := RetryPostgresNonIdempotent(context.Background(), db, RetryNonIdempotentOptions{}, func(*sql.Tx) error { calls++ return nil }) @@ -321,7 +321,7 @@ func TestRetryPostgresTx(t *testing.T) { defer db.Close() calls := 0 - err := RetryPostgresTx(context.Background(), db, RetryTxOptions{ + err := RetryPostgresNonIdempotent(context.Background(), db, RetryNonIdempotentOptions{ IsRetryable: func(err error) bool { return errors.Is(err, sentinelErr) }, }, func(*sql.Tx) error { calls++ @@ -343,7 +343,7 @@ func TestRetryPostgresTx(t *testing.T) { defer db.Close() calls := 0 - err := RetryPostgresTx(ctx, db, RetryTxOptions{}, func(*sql.Tx) error { + err := RetryPostgresNonIdempotent(ctx, db, RetryNonIdempotentOptions{}, func(*sql.Tx) error { calls++ if calls == 1 { cancel() // cancel after the first attempt runs @@ -362,7 +362,7 @@ func TestRetryPostgresTx(t *testing.T) { defer db.Close() calls := 0 - err := RetryPostgresTx(context.Background(), db, RetryTxOptions{}, func(*sql.Tx) error { + err := RetryPostgresNonIdempotent(context.Background(), db, RetryNonIdempotentOptions{}, func(*sql.Tx) error { calls++ return &pgconn.PgError{Code: pgerrcode.DeadlockDetected} }) @@ -456,14 +456,14 @@ func TestIsRetryablePostgresPreCommitError(t *testing.T) { require.False(t, isRetryablePostgresPreCommitError(nil)) } -func TestRetryMySQLTxNotImplemented(t *testing.T) { - err := RetryMySQLTx(context.Background(), nil, RetryTxOptions{}, func(*sql.Tx) error { return nil }) +func TestRetryMySQLNonIdempotentNotImplemented(t *testing.T) { + err := RetryMySQLNonIdempotent(context.Background(), nil, RetryNonIdempotentOptions{}, func(*sql.Tx) error { return nil }) require.Error(t, err) require.Contains(t, err.Error(), "not yet implemented") } -func TestRetrySpannerTxNotImplemented(t *testing.T) { - err := RetrySpannerTx(context.Background(), nil, RetryTxOptions{}, func(*sql.Tx) error { return nil }) +func TestRetrySpannerNonIdempotentNotImplemented(t *testing.T) { + err := RetrySpannerNonIdempotent(context.Background(), nil, RetryNonIdempotentOptions{}, func(*sql.Tx) error { return nil }) require.Error(t, err) require.Contains(t, err.Error(), "not yet implemented") } diff --git a/database/spanner.go b/database/spanner.go index feec8f0f..4f68d283 100644 --- a/database/spanner.go +++ b/database/spanner.go @@ -44,15 +44,15 @@ func SpannerUniqueViolation(err error) bool { strings.Contains(err.Error(), "AlreadyExists") } -// RetrySpannerTx is the Spanner equivalent of RetryPostgresTx. +// RetrySpannerNonIdempotent is the Spanner equivalent of RetryPostgresNonIdempotent. // TODO: consider delegating to spannerdriver.RunTransactionWithOptions for // ABORTED replay, plus network retry. retryTx already handles the loop and // driver.ErrBadConn; the missing piece is a Spanner-specific retryClassifier. // See https://pkg.go.dev/github.com/googleapis/go-sql-spanner -func RetrySpannerTx(ctx context.Context, db *sql.DB, opts RetryTxOptions, fn func(*sql.Tx) error) error { +func RetrySpannerNonIdempotent(ctx context.Context, db *sql.DB, opts RetryNonIdempotentOptions, fn func(*sql.Tx) error) error { _ = ctx _ = db _ = opts _ = fn - return errors.New("database: RetrySpannerTx is not yet implemented; see TODO") + return errors.New("database: RetrySpannerNonIdempotent is not yet implemented; see TODO") } From 8c8167d4513ceb392f15bad4765ff9ac1547a71b Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Tue, 30 Jun 2026 12:57:20 +0000 Subject: [PATCH 12/13] database: verify ambiguous commits via pg_current_xact_id_if_assigned + pg_xact_status MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RetryPostgresNonIdempotent now verifies whether an ambiguous commit landed, per r3470551946 and the PostgreSQL docs (pg_xact_status is documented for 'determine whether their transaction committed or aborted after the application and database server become disconnected while a COMMIT is in progress'). Always-on for the non-idempotent variant (no opt-in). Before each Commit, capture the transaction id: SELECT pg_current_xact_id_if_assigned()::text (NULL → read-only, no writes to duplicate). On a commit-phase error the commit classifier rejects (network, 57P01/57P02, etc.), query pg_xact_status($1::xid8) on a FRESH connection (the original may be dead; xid8 is globally unique so this works across connections/reconnects, modulo a failover replay-lag window): - 'aborted' → retry (commit didn't land). - 'committed' → ErrCommitted (don't retry — would duplicate). - NULL/'in progress'/error → ErrCommitPhase (ambiguous, caller decides). - read-only (xid nil) → retry without verifying (no writes to duplicate). Class-40 commit errors (40001/40P01, server rolled back) are retryable per the commit classifier, so they're retried without verification; on exhaustion they return the original error (not ErrCommitPhase — they're known non-commit). ErrCommitPhase is now only for INCONCLUSIVE commits. Implementation: - ErrCommitted sentinel (distinct from ErrCommitPhase). - commitStatus enum (aborted/committed/unknown). - retryDB interface (BeginTx + QueryRowContext) so retryTx can run the fresh-connection pg_xact_status query. - retryClassifier gains captureXid/verifyCommit hooks (nil for future MySQL/Spanner → ErrCommitPhase fallback, current behavior). - runOneTxAttempt captures the xid before Commit and returns it. - retryTx loop: on a classifier-rejected commit error, verify and decide retry / ErrCommitted / ErrCommitPhase; read-only → retry. - postgres.go: capturePostgresXid (::text cast sidesteps pgx's xid8 Go type-mapping) and verifyPostgresCommit (::xid8 cast), wired into postgresTxClassifier. Removed the TODO(future). Tests: mock driver implements QueryerContext, scripting the two verification queries (captureXidResult/verifyStatusResult/verifyErr). New TestRetryPostgres- NonIdempotent subtests cover verify-aborted→retry, verify-committed→ErrCommitted, verify-unknown→ErrCommitPhase, read-only→retry-without-verifying, verify-error →ErrCommitPhase, and class-40-exhaustion→original-error. TestErrCommitted added. ErrCommitPhase-exhaustion test repurposed (class-40 now returns the original, not ErrCommitPhase). Co-authored-by: E-Love (Eric Loveland) --- database/postgres.go | 73 +++++++++++++-- database/retry.go | 137 ++++++++++++++++++++------- database/retry_test.go | 206 ++++++++++++++++++++++++++++++++++++----- 3 files changed, 351 insertions(+), 65 deletions(-) diff --git a/database/postgres.go b/database/postgres.go index 641b1893..534892e1 100644 --- a/database/postgres.go +++ b/database/postgres.go @@ -255,24 +255,77 @@ func isRetryablePostgresPreCommitError(err error) bool { return true } -// RetryPostgresNonIdempotent runs fn in a Postgres transaction, retrying the whole -// transaction on transient errors. fn gets a fresh *sql.Tx each attempt; if it -// returns an error the tx is rolled back, if nil the tx is committed. +// capturePostgresXid returns the current transaction's id (pg_current_xact_id_if_assigned) +// as a string, or nil if no id is assigned (read-only transaction). Captured +// before Commit so verifyPostgresCommit can check whether an ambiguous commit +// landed. The ::text cast sidesteps pgx's xid8 Go type-mapping; the string is +// passed back to pg_xact_status($1::xid8) on verification. Returns nil for +// read-only transactions (no xid), in which case retryTx retries an ambiguous +// commit without verifying (no writes to duplicate). +func capturePostgresXid(tx *sql.Tx) (any, error) { + var s sql.NullString + if err := tx.QueryRow("SELECT pg_current_xact_id_if_assigned()::text").Scan(&s); err != nil { + return nil, err + } + if !s.Valid { + return nil, nil // read-only, no xid assigned + } + return s.String, nil +} + +// verifyPostgresCommit checks whether the transaction with the given xid +// committed, aborted, or is indeterminate, via pg_xact_status on a fresh +// connection (the original may be dead). xid is the string from +// capturePostgresXid. Returns: +// - commitStatusAborted: the commit did NOT land → safe to retry. +// - commitStatusCommitted: the commit DID land → don't retry (ErrCommitted). +// - commitStatusUnknown: pg_xact_status returned NULL (xid too old, or not +// yet visible after a failover) or errored, or status was "in progress" → +// ambiguous (ErrCommitPhase). +func verifyPostgresCommit(ctx context.Context, db retryDB, xid any) (commitStatus, error) { + s, ok := xid.(string) + if !ok || s == "" { + return commitStatusUnknown, nil + } + var status sql.NullString + if err := db.QueryRowContext(ctx, "SELECT pg_xact_status($1::xid8)", s).Scan(&status); err != nil { + return commitStatusUnknown, err + } + switch status.String { + case "aborted": + return commitStatusAborted, nil + case "committed": + return commitStatusCommitted, nil + default: // "" (NULL), "in progress", or unexpected → ambiguous + return commitStatusUnknown, nil + } +} + +// RetryPostgresNonIdempotent runs fn in a Postgres transaction, retrying the +// whole transaction on transient errors. fn gets a fresh *sql.Tx each attempt; +// if it returns an error the tx is rolled back, if nil the tx is committed. // // Pre-commit errors use the opt-out isRetryablePostgresPreCommitError; commit -// errors use the narrower opt-in isSafeRetryablePostgresError (a commit-phase -// network error may mean the commit already succeeded). Non-retried commit -// errors are wrapped with ErrCommitPhase. opts.IsRetryable is additive on both. +// errors use the narrower opt-in isSafeRetryablePostgresError. Commit-phase +// errors the commit classifier rejects are verified: before each Commit the +// transaction id is captured (pg_current_xact_id_if_assigned), and on an +// ambiguous commit error pg_xact_status is queried on a fresh connection — +// aborted → retry, committed → ErrCommitted (don't retry), inconclusive → +// ErrCommitPhase. Read-only transactions (no xid) retry without verification. +// opts.IsRetryable is additive on both classifiers. // // db must be pgx-backed (e.g. from database.New with a PostgresConfig); other -// drivers won't get pgx-specific retries. +// drivers won't get pgx-specific retries or commit verification. func RetryPostgresNonIdempotent(ctx context.Context, db *sql.DB, opts RetryNonIdempotentOptions, fn func(*sql.Tx) error) error { return retryTx(ctx, db, postgresTxClassifier, opts, fn) } // postgresTxClassifier pairs the pre-commit (opt-out) and commit (opt-in) -// classifiers. See ErrCommitPhase. +// classifiers with commit verification (capturePostgresXid/verifyPostgresCommit). +// See ErrCommitPhase / ErrCommitted. var postgresTxClassifier = retryClassifier{ - preCommit: isRetryablePostgresPreCommitError, - commit: isSafeRetryablePostgresError, + preCommit: isRetryablePostgresPreCommitError, + commit: isSafeRetryablePostgresError, + captureXid: capturePostgresXid, + verifyCommit: verifyPostgresCommit, } diff --git a/database/retry.go b/database/retry.go index fbb07eb3..53743d3e 100644 --- a/database/retry.go +++ b/database/retry.go @@ -31,26 +31,63 @@ type RetryIdempotentOptions struct { // configurable: use the context deadline to bound total time, or wrap for more. const maxRetryAttempts = 3 -// ErrCommitPhase wraps a non-retried error from (*sql.Tx).Commit. A commit- -// phase error is ambiguous: the commit may have succeeded before the error was -// returned (e.g. connection severed after COMMIT sent but before the response). -// RetryPostgresNonIdempotent doesn't retry such errors (could duplicate the work) and -// wraps them with ErrCommitPhase so the caller can detect them via errors.Is -// and decide whether to alert/reconcile/check if the commit landed (future -// pg_xact_status() work). The underlying error is preserved. -var ErrCommitPhase = errors.New("database: commit-phase error; transaction may have committed before the error was returned") +// ErrCommitPhase wraps a non-retried commit-phase error whose commit status +// could not be verified — i.e. the commit may or may not have landed (e.g. a +// network error after COMMIT was sent, or pg_xact_status() couldn't determine +// the outcome). The caller should treat it as ambiguous: alert/reconcile, or +// re-check whether the commit landed. The underlying error is preserved. +// +// With commit verification enabled (RetryPostgresNonIdempotent), ErrCommitPhase +// is the fallback when verification is inconclusive; verified-aborted commits +// are retried, verified-committed commits return ErrCommitted. +var ErrCommitPhase = errors.New("database: commit-phase error with inconclusive commit status; the transaction may have committed before the error was returned") + +// ErrCommitted wraps a commit-phase error where verification (pg_xact_status) +// confirmed the transaction DID commit on the server before the error was +// returned to the client (e.g. the connection was severed after the commit +// recorded but before the response arrived). The caller must NOT retry (that +// would duplicate the committed work); it should reconcile/alert. The +// underlying commit error is preserved for inspection via errors.Is/errors.As. +var ErrCommitted = errors.New("database: transaction committed before the error was returned; do not retry") + +// commitStatus is the verified outcome of an ambiguous commit-phase error. +type commitStatus int + +const ( + commitStatusUnknown commitStatus = iota // can't determine → ErrCommitPhase + commitStatusAborted // commit did NOT land → safe to retry + commitStatusCommitted // commit DID land → ErrCommitted, don't retry +) // beginTxer is satisfied by *sql.DB; the interface lets tests fake BeginTx. type beginTxer interface { BeginTx(ctx context.Context, opts *sql.TxOptions) (*sql.Tx, error) } -// retryClassifier holds per-phase classifiers: preCommit for BeginTx/fn errors, -// commit for (*sql.Tx).Commit errors. commit must be narrower (a commit-phase -// error may mean the commit already succeeded — see ErrCommitPhase). +// retryDB extends beginTxer with QueryRowContext, so the retry loop can run a +// fresh-connection query (pg_xact_status) to verify an ambiguous commit. +// Satisfied by *sql.DB. +type retryDB interface { + beginTxer + QueryRowContext(ctx context.Context, query string, args ...any) *sql.Row +} + +// retryClassifier holds per-phase classifiers and optional commit-verification +// hooks. preCommit classifies BeginTx/fn errors; commit classifies +// (*sql.Tx).Commit errors (must be narrower — a commit-phase error may mean +// the commit already succeeded). captureXid/verifyCommit enable commit +// verification for ambiguous commit errors (see ErrCommitPhase/ErrCommitted). type retryClassifier struct { preCommit func(error) bool commit func(error) bool + // captureXid, if non-nil, is called after fn succeeds and before Commit to + // capture a transaction identifier for verifyCommit. Returns nil if the + // transaction has no id (e.g. read-only — safe to retry without verifying). + captureXid func(*sql.Tx) (any, error) + // verifyCommit, if non-nil, is called on a commit-phase error the commit + // classifier rejected, to check whether the commit landed. Runs on a fresh + // connection (db) since the original may be dead. xid is from captureXid. + verifyCommit func(ctx context.Context, db retryDB, xid any) (commitStatus, error) } // classify picks the commit classifier when commitPhase, else preCommit. @@ -95,52 +132,84 @@ func isRetryableUnsafeDefault(err error) bool { return true } -// retryTx is the shared retry loop for RetryPostgresNonIdempotent (and future MySQL/Spanner -// implementations). opts.IsRetryable is OR'd with base on both phases. Non- -// retried commit-phase errors are wrapped with ErrCommitPhase. +// retryTx is the shared retry loop for RetryPostgresNonIdempotent (and future +// MySQL/Spanner implementations). opts.IsRetryable is OR'd with base on both +// phases. Commit-phase errors the commit classifier rejects are verified via +// base.verifyCommit when an xid was captured: verified-aborted → retry, +// verified-committed → ErrCommitted (don't retry), inconclusive → ErrCommitPhase. +// Read-only transactions (no xid) retry without verification (no writes to +// duplicate). Commit-phase errors the classifier accepts (class 40: server +// rolled back) are retried; on exhaustion they return the original error (not +// ErrCommitPhase — they're known non-commit). // // database/sql already retries driver.ErrBadConn for *sql.DB methods (immediate, // up to 3 attempts) before surfacing it; this outer loop layers on top with // jitter for longer outages. *sql.Tx methods have no internal retry, so this is // the only retry for tx.Exec/tx.Commit. -func retryTx(ctx context.Context, db beginTxer, base retryClassifier, opts RetryNonIdempotentOptions, fn func(*sql.Tx) error) error { +func retryTx(ctx context.Context, db retryDB, base retryClassifier, opts RetryNonIdempotentOptions, fn func(*sql.Tx) error) error { classifier := base if opts.IsRetryable != nil { extra := opts.IsRetryable - classifier = retryClassifier{ - preCommit: func(err error) bool { return base.preCommit(err) || extra(err) }, - commit: func(err error) bool { return base.commit(err) || extra(err) }, - } + classifier.preCommit = func(err error) bool { return base.preCommit(err) || extra(err) } + classifier.commit = func(err error) bool { return base.commit(err) || extra(err) } } var lastErr error var lastCommitPhase bool + var lastRetryable bool for attempt := 0; attempt < maxRetryAttempts; attempt++ { var commitPhase bool - lastErr, commitPhase = runOneTxAttempt(ctx, db, opts.TxOptions, fn) + var xid any + lastErr, commitPhase, xid = runOneTxAttempt(ctx, db, opts.TxOptions, base.captureXid, fn) lastCommitPhase = commitPhase if lastErr == nil { return nil } - if !classifier.classify(lastErr, commitPhase) { + retryable := classifier.classify(lastErr, commitPhase) + if !retryable && commitPhase { + // Commit-phase error the classifier rejected — verify whether the + // commit landed, if we captured an xid and have a verifier. + switch { + case base.captureXid != nil && xid == nil: + // Read-only transaction (no xid assigned): no writes to + // duplicate, so retry is safe without verification. + retryable = true + case base.verifyCommit != nil && xid != nil: + switch status, verr := base.verifyCommit(ctx, db, xid); { + case verr == nil && status == commitStatusAborted: + retryable = true + case verr == nil && status == commitStatusCommitted: + return fmt.Errorf("%w: %w", ErrCommitted, lastErr) + default: // commitStatusUnknown or verification error → ambiguous + retryable = false + } + } + } + lastRetryable = retryable + if !retryable { break } if !sleepWithJitter(ctx, attempt, maxRetryAttempts) { return ctx.Err() } } - if lastCommitPhase { + // A retryable commit-phase error (class-40 rolled back, verified-aborted, + // or read-only) is known non-commit, so on exhaustion return the original + // error — not ErrCommitPhase (which is only for inconclusive commits). + if lastCommitPhase && !lastRetryable { return fmt.Errorf("%w: %w", ErrCommitPhase, lastErr) } return lastErr } -// runOneTxAttempt runs one begin/fn/commit attempt, returning the first error -// and whether it came from Commit (commitPhase). The deferred Rollback is a -// no-op after a successful Commit. -func runOneTxAttempt(ctx context.Context, db beginTxer, txOpts *sql.TxOptions, fn func(*sql.Tx) error) (err error, commitPhase bool) { +// runOneTxAttempt runs one begin/fn/commit attempt, returning the first error, +// whether it came from Commit (commitPhase), and the transaction id captured +// by captureXid (nil if none). The deferred Rollback is a no-op after a +// successful Commit. A captureXid failure is treated as a pre-commit error +// (rolled back, classified by preCommit). +func runOneTxAttempt(ctx context.Context, db beginTxer, txOpts *sql.TxOptions, captureXid func(*sql.Tx) (any, error), fn func(*sql.Tx) error) (err error, commitPhase bool, xid any) { tx, beginErr := db.BeginTx(ctx, txOpts) if beginErr != nil { - return beginErr, false + return beginErr, false, nil } defer func() { if p := recover(); p != nil { @@ -152,14 +221,18 @@ func runOneTxAttempt(ctx context.Context, db beginTxer, txOpts *sql.TxOptions, f } }() if err = fn(tx); err != nil { - return err, false + return err, false, nil + } + if captureXid != nil { + xid, err = captureXid(tx) + if err != nil { + return err, false, nil + } } - // TODO(future): capture pg_current_xact_id() pre-commit and consult - // pg_xact_status() on commit error to detect if the commit landed. if err = tx.Commit(); err != nil { - return err, true + return err, true, xid } - return nil, false + return nil, false, nil } // sleepWithJitter sleeps a random duration in [0, retryJitterMax) before the diff --git a/database/retry_test.go b/database/retry_test.go index 0545090d..36d6153a 100644 --- a/database/retry_test.go +++ b/database/retry_test.go @@ -8,6 +8,7 @@ import ( "fmt" "io" "net" + "strings" "sync" "testing" @@ -43,11 +44,21 @@ type mockDriver struct { commitErrs []error rollbackErrs []error execErrs []error - mu sync.Mutex - beginCalls int - commitCalls int - rollbackCalls int - execCalls int + // Commit-verification scripting (matched by query substring in QueryContext): + // captureXidResult is returned for pg_current_xact_id_if_assigned (a string + // xid, or nil for read-only/NULL); verifyStatusResult for pg_xact_status + // ("aborted"/"committed"/"in progress"/"" for NULL); verifyErr makes the + // pg_xact_status query fail. + captureXidResult any + verifyStatusResult string + verifyErr error + mu sync.Mutex + beginCalls int + commitCalls int + rollbackCalls int + execCalls int + captureCalls int + verifyCalls int } func (d *mockDriver) Open(string) (driver.Conn, error) { return &mockConn{d: d}, nil } @@ -93,6 +104,26 @@ func (c *mockConn) PrepareContext(context.Context, string) (driver.Stmt, error) return nil, errors.New("mockDriver: PrepareContext not implemented") } +// QueryContext implements driver.QueryerContext so *sql.Tx.QueryRow and +// *sql.DB.QueryRowContext work. It recognizes the commit-verification queries +// (by substring) and returns the scripted value from the driver. +func (c *mockConn) QueryContext(_ context.Context, query string, _ []driver.NamedValue) (driver.Rows, error) { + c.d.mu.Lock() + defer c.d.mu.Unlock() + switch { + case strings.Contains(query, "pg_current_xact_id_if_assigned"): + c.d.captureCalls++ + return &mockRows{val: c.d.captureXidResult}, nil + case strings.Contains(query, "pg_xact_status"): + c.d.verifyCalls++ + if c.d.verifyErr != nil { + return nil, c.d.verifyErr + } + return &mockRows{val: c.d.verifyStatusResult}, nil + } + return nil, errors.New("mockDriver: unexpected query: " + query) +} + type mockTx struct{ d *mockDriver } func (t *mockTx) Commit() error { @@ -104,6 +135,24 @@ func (t *mockTx) Rollback() error { return err } +// mockRows is a single-row, single-column result set holding val (a string, or +// nil for SQL NULL). Used by QueryContext for the verification queries. +type mockRows struct { + val any + read bool +} + +func (r *mockRows) Columns() []string { return []string{"col"} } +func (r *mockRows) Close() error { return nil } +func (r *mockRows) Next(dest []driver.Value) error { + if r.read { + return io.EOF + } + r.read = true + dest[0] = r.val + return nil +} + // nthErr returns errs[i] or nil when i is out of range. Tests pass a sequence // like []error{err1, err2} to mean "fail twice then succeed". func nthErr(errs []error, i int) error { @@ -232,13 +281,15 @@ func TestRetryPostgresNonIdempotent(t *testing.T) { require.Equal(t, 2, d.commitCalls) // first commit failed, second succeeded }) - t.Run("commit-phase network error is NOT retried and is wrapped with ErrCommitPhase", func(t *testing.T) { + t.Run("commit-phase network error with inconclusive verification is wrapped with ErrCommitPhase", func(t *testing.T) { // A network error during commit could mean the commit succeeded but the - // response was lost, so the narrow commit classifier does NOT retry it. - // It is returned to the caller wrapped with ErrCommitPhase so the caller - // can detect the ambiguous commit-phase failure. + // response was lost. The commit classifier rejects it; verification + // (pg_xact_status) returns NULL (inconclusive, e.g. xid not yet visible + // after a failover) → ambiguous → ErrCommitPhase (caller decides). d := &mockDriver{ - commitErrs: []error{io.EOF}, + captureXidResult: "1", // write transaction, has an xid + verifyStatusResult: "", // NULL → inconclusive + commitErrs: []error{io.EOF}, } db := newMockDB(d) defer db.Close() @@ -250,19 +301,21 @@ func TestRetryPostgresNonIdempotent(t *testing.T) { }) require.Error(t, err) require.Equal(t, 1, calls) // not retried - require.Equal(t, 1, d.beginCalls) - require.Equal(t, 1, d.commitCalls) // commit was attempted (and failed) + require.Equal(t, 1, d.commitCalls) require.Equal(t, 0, d.rollbackCalls) // commit failed; rollback is a no-op (ErrTxDone) + require.Equal(t, 1, d.captureCalls) // xid captured before commit + require.Equal(t, 1, d.verifyCalls) // pg_xact_status queried once require.ErrorIs(t, err, ErrCommitPhase) require.ErrorIs(t, err, io.EOF) // underlying error preserved }) - t.Run("commit-phase admin_shutdown (57P01) is NOT retried and is wrapped with ErrCommitPhase", func(t *testing.T) { - // 57P01 is excluded from the commit classifier: die() can interrupt after - // the commit is recorded, so 57P01 at commit could mean "possibly - // committed". Conservatively wrap with ErrCommitPhase (caller decides). + t.Run("commit-phase admin_shutdown (57P01) with inconclusive verification is wrapped with ErrCommitPhase", func(t *testing.T) { + // 57P01 is excluded from the commit classifier; verification inconclusive + // → ErrCommitPhase. d := &mockDriver{ - commitErrs: []error{&pgconn.PgError{Code: pgerrcode.AdminShutdown}}, + captureXidResult: "1", + verifyStatusResult: "", + commitErrs: []error{&pgconn.PgError{Code: pgerrcode.AdminShutdown}}, } db := newMockDB(d) defer db.Close() @@ -275,17 +328,112 @@ func TestRetryPostgresNonIdempotent(t *testing.T) { require.Error(t, err) require.Equal(t, 1, calls) // not retried require.Equal(t, 1, d.commitCalls) + require.Equal(t, 1, d.verifyCalls) require.ErrorIs(t, err, ErrCommitPhase) var pgErr *pgconn.PgError require.ErrorAs(t, err, &pgErr) require.Equal(t, pgerrcode.AdminShutdown, pgErr.Code) }) - t.Run("commit-phase exhaustion wraps the final error with ErrCommitPhase", func(t *testing.T) { - // A retryable commit error (serialization_failure at commit) that never - // succeeds: the final error is a commit-phase error, so it is wrapped - // with ErrCommitPhase even though we exhausted attempts retrying it. + t.Run("commit-phase verify-aborted is retried", func(t *testing.T) { + // Network error at commit; pg_xact_status says aborted (commit didn't + // land) → safe to retry → second attempt succeeds. d := &mockDriver{ + captureXidResult: "1", + verifyStatusResult: "aborted", + commitErrs: []error{io.EOF, nil}, + } + db := newMockDB(d) + defer db.Close() + + calls := 0 + err := RetryPostgresNonIdempotent(context.Background(), db, RetryNonIdempotentOptions{}, func(*sql.Tx) error { + calls++ + return nil + }) + require.NoError(t, err) + require.Equal(t, 2, calls) + require.Equal(t, 2, d.commitCalls) + require.Equal(t, 1, d.verifyCalls) // verified once (first attempt's commit) + }) + + t.Run("commit-phase verify-committed returns ErrCommitted (not retried)", func(t *testing.T) { + // Network error at commit; pg_xact_status says committed (commit DID + // land) → must NOT retry (would duplicate) → ErrCommitted. + d := &mockDriver{ + captureXidResult: "1", + verifyStatusResult: "committed", + commitErrs: []error{io.EOF}, + } + db := newMockDB(d) + defer db.Close() + + calls := 0 + err := RetryPostgresNonIdempotent(context.Background(), db, RetryNonIdempotentOptions{}, func(*sql.Tx) error { + calls++ + return nil + }) + require.Error(t, err) + require.Equal(t, 1, calls) // not retried + require.Equal(t, 1, d.verifyCalls) + require.ErrorIs(t, err, ErrCommitted) + require.ErrorIs(t, err, io.EOF) // underlying commit error preserved + require.NotErrorIs(t, err, ErrCommitPhase) // not ambiguous — verified committed + }) + + t.Run("commit-phase read-only (no xid) is retried without verifying", func(t *testing.T) { + // Read-only transaction (pg_current_xact_id_if_assigned returns NULL): + // no writes to duplicate, so retry is safe without calling pg_xact_status. + d := &mockDriver{ + captureXidResult: nil, // read-only + commitErrs: []error{io.EOF, nil}, + } + db := newMockDB(d) + defer db.Close() + + calls := 0 + err := RetryPostgresNonIdempotent(context.Background(), db, RetryNonIdempotentOptions{}, func(*sql.Tx) error { + calls++ + return nil + }) + require.NoError(t, err) + require.Equal(t, 2, calls) + require.Equal(t, 0, d.verifyCalls) // verification skipped (read-only) + require.Equal(t, 2, d.captureCalls) + }) + + t.Run("commit-phase verify-error is ambiguous (ErrCommitPhase)", func(t *testing.T) { + // pg_xact_status itself errors (e.g. the fresh connection also failed) → + // inconclusive → ErrCommitPhase. + d := &mockDriver{ + captureXidResult: "1", + verifyErr: errors.New("verify connection lost"), + commitErrs: []error{io.EOF}, + } + db := newMockDB(d) + defer db.Close() + + calls := 0 + err := RetryPostgresNonIdempotent(context.Background(), db, RetryNonIdempotentOptions{}, func(*sql.Tx) error { + calls++ + return nil + }) + require.Error(t, err) + require.Equal(t, 1, calls) // not retried + require.Equal(t, 1, d.verifyCalls) + require.ErrorIs(t, err, ErrCommitPhase) + require.ErrorIs(t, err, io.EOF) + }) + + t.Run("commit-phase class-40 exhaustion returns the original error (not ErrCommitPhase)", func(t *testing.T) { + // serialization_failure (class 40) at commit is retryable per the commit + // classifier (server rolled back → known non-commit). Retried until + // exhausted; the original error is returned (NOT ErrCommitPhase — it's + // not ambiguous, and verification isn't consulted for classifier- + // retryable commit errors). + d := &mockDriver{ + captureXidResult: "1", + verifyStatusResult: "aborted", // unused — classifier says retryable, verify not called commitErrs: []error{ &pgconn.PgError{Code: pgerrcode.SerializationFailure}, &pgconn.PgError{Code: pgerrcode.SerializationFailure}, @@ -303,8 +451,8 @@ func TestRetryPostgresNonIdempotent(t *testing.T) { require.Error(t, err) require.Equal(t, 3, calls) require.Equal(t, 3, d.commitCalls) - require.ErrorIs(t, err, ErrCommitPhase) - // The underlying *pgconn.PgError is preserved for inspection via errors.As. + require.Equal(t, 0, d.verifyCalls) // classifier-retryable → no verification + require.NotErrorIs(t, err, ErrCommitPhase) // known non-commit, not ambiguous var pgErr *pgconn.PgError require.ErrorAs(t, err, &pgErr) require.Equal(t, pgerrcode.SerializationFailure, pgErr.Code) @@ -481,3 +629,15 @@ func TestErrCommitPhase(t *testing.T) { require.NotErrorIs(t, underlying, ErrCommitPhase) require.NotErrorIs(t, io.EOF, ErrCommitPhase) } + +func TestErrCommitted(t *testing.T) { + // ErrCommitted wraps a commit error verified to have committed on the server + // (the response was lost). Callers detect it via errors.Is and must NOT retry. + underlying := errors.New("connection severed after commit recorded") + wrapped := fmt.Errorf("%w: %w", ErrCommitted, underlying) + + require.ErrorIs(t, wrapped, ErrCommitted, "caller can detect verified-committed") + require.ErrorIs(t, wrapped, underlying, "underlying commit error preserved") + require.NotErrorIs(t, wrapped, ErrCommitPhase) // distinct from ambiguous ErrCommitPhase + require.NotErrorIs(t, io.EOF, ErrCommitted) +} From c2855b04b59357f50dbff91866fe56253744a09a Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Tue, 30 Jun 2026 13:33:30 +0000 Subject: [PATCH 13/13] =?UTF-8?q?database:=20always=20verify=20commit=20er?= =?UTF-8?q?rors=20=E2=80=94=20drop=20the=20commit=20classifier?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that commit verification (pg_xact_status) is in place, the separate 'commit classifier' (isSafeRetryablePostgresError) is unnecessary. Commit errors are rare, and we already pay a round-trip to capture the xid before every commit — so always checking pg_xact_status on a commit error (one more round-trip, only on the rare commit error) is negligible, and it lets the SERVER be the authority on whether the commit landed instead of a client-side guess. retryTx now: pre-commit errors → preCommit classifier (opt-out, unchanged); commit errors → always verify (read-only/xid-nil → retry without verifying; aborted → retry; committed → ErrCommitted; inconclusive → ErrCommitPhase). No commit classifier, no classify method. opts.IsRetryable is additive on preCommit only (commit errors are verified, not classified — safer, since a caller shouldn't override commit verification). Removed isSafeRetryablePostgresError (and its complex exclusion doc about 57P01/57P02/57014/57P03/53300/network/driver.ErrBadConn) and TestIsSafeRetryablePostgresError. postgresTxClassifier drops the commit field. Trade-off: class-40 / SafeToRetry commit errors now go through verification (+1 pg_xact_status round-trip; previously fast-pathed). No regression for common cases — class-40 → 'aborted' → retry (same outcome); the rare SafeToRetry+failover+replay-lag case → ErrCommitPhase (recoverable). Class-40 doesn't hit the replay-lag case (the server responded, no failover). Tests updated: SafeToRetry-commit subtest now verifies (verifyStatusResult= aborted, verifyCalls==1); class-40-exhaustion subtest verifies each attempt (verifyCalls==3) and still returns the original error (not ErrCommitPhase). Co-authored-by: E-Love (Eric Loveland) --- database/postgres.go | 67 +++++++--------------------------- database/retry.go | 81 ++++++++++++++++++++++-------------------- database/retry_test.go | 60 +++++++++---------------------- 3 files changed, 70 insertions(+), 138 deletions(-) diff --git a/database/postgres.go b/database/postgres.go index 534892e1..33aa1e22 100644 --- a/database/postgres.go +++ b/database/postgres.go @@ -170,49 +170,6 @@ func PostgresDeadlockFound(err error) bool { return strings.Contains(err.Error(), pgerrcode.DeadlockDetected) } -// isSafeRetryablePostgresError is the commit-phase classifier for -// RetryPostgresNonIdempotent: true iff the error guarantees the commit didn't happen, so -// retry is safe even at commit. Maximally conservative — only errors that can -// NEVER mean "possibly committed": -// -// - pgconn.SafeToRetry: pgx guarantees the error occurred before any data -// was sent to the server (COMMIT message never sent). -// - pgerrcode.IsTransactionRollback: class 40 SQLSTATE codes (40001 -// serialization_failure, 40P01 deadlock_detected, 40002, 40003, 40000) — -// the server rolled back the transaction before returning the ErrorResponse. -// See https://www.postgresql.org/docs/current/mvcc-serialization-failure-handling.html -// -// Excluded (could mean "possibly committed" → wrapped with ErrCommitPhase): -// - 57P01 admin_shutdown, 57P02 crash_shutdown: die()/quickdie() are signal -// handlers that can interrupt after the commit is recorded but before the -// response is sent. quickdie() also skips rollback (_exit(2)). -// - 57014 query_canceled: a cancel arriving after the commit records is a -// theoretical edge case. -// - 57P03/53300: connect-time, won't surface from Commit. -// - network errors: could mean commit succeeded but the response was lost. -// - driver.ErrBadConn: pgx's Commit returns native errors (the ErrBadConn -// conversion is only in Exec/Query), so the commit path never sees it; -// pre-send commit is covered by pgconn.SafeToRetry. *sql.Tx.Commit doesn't -// retry ErrBadConn internally (only *sql.DB methods do). -// -// Everything excluded here is still retried pre-commit via -// isRetryablePostgresPreCommitError's opt-out (the tx didn't commit pre-commit). -// -// Unexported: use RetryPostgresNonIdempotent or RetryIdempotent rather than building on this. -func isSafeRetryablePostgresError(err error) bool { - if err == nil { - return false - } - if pgconn.SafeToRetry(err) { - return true - } - var pgErr *pgconn.PgError - if errors.As(err, &pgErr) { - return pgerrcode.IsTransactionRollback(pgErr.Code) - } - return false -} - // isPermanentPostgresError reports SQLSTATE classes that would fail again on // retry, used by isRetryablePostgresPreCommitError to opt out. Only // clearly-permanent classes are listed so unknown/transient codes are retried. @@ -305,14 +262,15 @@ func verifyPostgresCommit(ctx context.Context, db retryDB, xid any) (commitStatu // whole transaction on transient errors. fn gets a fresh *sql.Tx each attempt; // if it returns an error the tx is rolled back, if nil the tx is committed. // -// Pre-commit errors use the opt-out isRetryablePostgresPreCommitError; commit -// errors use the narrower opt-in isSafeRetryablePostgresError. Commit-phase -// errors the commit classifier rejects are verified: before each Commit the -// transaction id is captured (pg_current_xact_id_if_assigned), and on an -// ambiguous commit error pg_xact_status is queried on a fresh connection — -// aborted → retry, committed → ErrCommitted (don't retry), inconclusive → -// ErrCommitPhase. Read-only transactions (no xid) retry without verification. -// opts.IsRetryable is additive on both classifiers. +// Pre-commit errors (from BeginTx or fn) use the opt-out +// isRetryablePostgresPreCommitError (opts.IsRetryable is additive on it). +// Commit errors are always verified: before each Commit the transaction id is +// captured (pg_current_xact_id_if_assigned), and on a commit error +// pg_xact_status is queried on a fresh connection — aborted → retry, +// committed → ErrCommitted (don't retry), inconclusive → ErrCommitPhase. +// Read-only transactions (no xid) retry without verification. No client-side +// "retryable commit" guess — the server is the authority on whether the commit +// landed. // // db must be pgx-backed (e.g. from database.New with a PostgresConfig); other // drivers won't get pgx-specific retries or commit verification. @@ -320,12 +278,11 @@ func RetryPostgresNonIdempotent(ctx context.Context, db *sql.DB, opts RetryNonId return retryTx(ctx, db, postgresTxClassifier, opts, fn) } -// postgresTxClassifier pairs the pre-commit (opt-out) and commit (opt-in) -// classifiers with commit verification (capturePostgresXid/verifyPostgresCommit). -// See ErrCommitPhase / ErrCommitted. +// postgresTxClassifier pairs the pre-commit (opt-out) classifier with commit +// verification (capturePostgresXid/verifyPostgresCommit). Commit errors are +// always verified, not classified. See ErrCommitPhase / ErrCommitted. var postgresTxClassifier = retryClassifier{ preCommit: isRetryablePostgresPreCommitError, - commit: isSafeRetryablePostgresError, captureXid: capturePostgresXid, verifyCommit: verifyPostgresCommit, } diff --git a/database/retry.go b/database/retry.go index 53743d3e..a5325dfe 100644 --- a/database/retry.go +++ b/database/retry.go @@ -14,8 +14,9 @@ import ( type RetryNonIdempotentOptions struct { // TxOptions passed to (*sql.DB).BeginTx each attempt; nil = default isolation. TxOptions *sql.TxOptions - // IsRetryable, if non-nil, is OR'd with the backend's default classifier - // (both phases). Be careful adding cases unsafe at commit (see ErrCommitPhase). + // IsRetryable, if non-nil, is OR'd with the backend's pre-commit classifier + // (errors from BeginTx or fn). Commit errors are always verified via + // pg_xact_status, not classified, so IsRetryable does not affect them. IsRetryable func(err error) bool } @@ -72,32 +73,24 @@ type retryDB interface { QueryRowContext(ctx context.Context, query string, args ...any) *sql.Row } -// retryClassifier holds per-phase classifiers and optional commit-verification -// hooks. preCommit classifies BeginTx/fn errors; commit classifies -// (*sql.Tx).Commit errors (must be narrower — a commit-phase error may mean -// the commit already succeeded). captureXid/verifyCommit enable commit -// verification for ambiguous commit errors (see ErrCommitPhase/ErrCommitted). +// retryClassifier holds the pre-commit classifier and optional commit- +// verification hooks. preCommit classifies BeginTx/fn errors (opt-out: pre- +// commit retry is always safe — the transaction rolls back). Commit errors are +// not classified; they're always verified via captureXid/verifyCommit (a commit +// error may mean the commit already succeeded, so the server is the authority, +// not a client-side guess). See ErrCommitPhase/ErrCommitted. type retryClassifier struct { preCommit func(error) bool - commit func(error) bool // captureXid, if non-nil, is called after fn succeeds and before Commit to // capture a transaction identifier for verifyCommit. Returns nil if the // transaction has no id (e.g. read-only — safe to retry without verifying). captureXid func(*sql.Tx) (any, error) - // verifyCommit, if non-nil, is called on a commit-phase error the commit - // classifier rejected, to check whether the commit landed. Runs on a fresh - // connection (db) since the original may be dead. xid is from captureXid. + // verifyCommit, if non-nil, is called on every commit error (with the xid + // from captureXid) to check whether the commit landed. Runs on a fresh + // connection (db) since the original may be dead. verifyCommit func(ctx context.Context, db retryDB, xid any) (commitStatus, error) } -// classify picks the commit classifier when commitPhase, else preCommit. -func (c retryClassifier) classify(err error, commitPhase bool) bool { - if commitPhase { - return c.commit(err) - } - return c.preCommit(err) -} - // RetryIdempotent runs fn up to maxRetryAttempts times, retrying on any error // opts.IsRetryable says is retryable (default: any except context cancellation/ // deadline). fn MUST be idempotent — no transaction wrapper, so a retry may @@ -133,25 +126,29 @@ func isRetryableUnsafeDefault(err error) bool { } // retryTx is the shared retry loop for RetryPostgresNonIdempotent (and future -// MySQL/Spanner implementations). opts.IsRetryable is OR'd with base on both -// phases. Commit-phase errors the commit classifier rejects are verified via -// base.verifyCommit when an xid was captured: verified-aborted → retry, -// verified-committed → ErrCommitted (don't retry), inconclusive → ErrCommitPhase. -// Read-only transactions (no xid) retry without verification (no writes to -// duplicate). Commit-phase errors the classifier accepts (class 40: server -// rolled back) are retried; on exhaustion they return the original error (not -// ErrCommitPhase — they're known non-commit). +// MySQL/Spanner implementations). opts.IsRetryable is OR'd with base.preCommit +// (pre-commit errors only — commit errors are always verified, not classified). +// +// Pre-commit errors (from BeginTx or fn): retryable per preCommit (opt-out — +// the transaction rolls back, so retry is always safe). +// +// Commit errors (from (*sql.Tx).Commit): always verified via verifyCommit when +// an xid was captured — verified-aborted → retry, verified-committed → +// ErrCommitted (don't retry), inconclusive → ErrCommitPhase. Read-only +// transactions (no xid) retry without verifying (no writes to duplicate). A +// retryable commit error (aborted/read-only) on exhaustion returns the original +// error, not ErrCommitPhase (it's known non-commit); only inconclusive commits +// get ErrCommitPhase. // // database/sql already retries driver.ErrBadConn for *sql.DB methods (immediate, // up to 3 attempts) before surfacing it; this outer loop layers on top with // jitter for longer outages. *sql.Tx methods have no internal retry, so this is // the only retry for tx.Exec/tx.Commit. func retryTx(ctx context.Context, db retryDB, base retryClassifier, opts RetryNonIdempotentOptions, fn func(*sql.Tx) error) error { - classifier := base + preCommit := base.preCommit if opts.IsRetryable != nil { extra := opts.IsRetryable - classifier.preCommit = func(err error) bool { return base.preCommit(err) || extra(err) } - classifier.commit = func(err error) bool { return base.commit(err) || extra(err) } + preCommit = func(err error) bool { return base.preCommit(err) || extra(err) } } var lastErr error var lastCommitPhase bool @@ -164,14 +161,18 @@ func retryTx(ctx context.Context, db retryDB, base retryClassifier, opts RetryNo if lastErr == nil { return nil } - retryable := classifier.classify(lastErr, commitPhase) - if !retryable && commitPhase { - // Commit-phase error the classifier rejected — verify whether the - // commit landed, if we captured an xid and have a verifier. + var retryable bool + if !commitPhase { + // Pre-commit error: opt-out classifier decides. + retryable = preCommit(lastErr) + } else { + // Commit error: always verify whether the commit landed (the server + // is the authority — a client-side guess can't tell committed-but- + // response-lost from genuinely-aborted). switch { case base.captureXid != nil && xid == nil: - // Read-only transaction (no xid assigned): no writes to - // duplicate, so retry is safe without verification. + // Read-only transaction (no xid): no writes to duplicate, retry + // is safe without verification. retryable = true case base.verifyCommit != nil && xid != nil: switch status, verr := base.verifyCommit(ctx, db, xid); { @@ -182,6 +183,8 @@ func retryTx(ctx context.Context, db retryDB, base retryClassifier, opts RetryNo default: // commitStatusUnknown or verification error → ambiguous retryable = false } + default: + retryable = false // no verifier → ambiguous (ErrCommitPhase) } } lastRetryable = retryable @@ -192,9 +195,9 @@ func retryTx(ctx context.Context, db retryDB, base retryClassifier, opts RetryNo return ctx.Err() } } - // A retryable commit-phase error (class-40 rolled back, verified-aborted, - // or read-only) is known non-commit, so on exhaustion return the original - // error — not ErrCommitPhase (which is only for inconclusive commits). + // A retryable commit error (verified-aborted or read-only) is known + // non-commit, so on exhaustion return the original error — not ErrCommitPhase + // (which is only for inconclusive commits). if lastCommitPhase && !lastRetryable { return fmt.Errorf("%w: %w", ErrCommitPhase, lastErr) } diff --git a/database/retry_test.go b/database/retry_test.go index 36d6153a..a92a346f 100644 --- a/database/retry_test.go +++ b/database/retry_test.go @@ -263,9 +263,15 @@ func TestRetryPostgresNonIdempotent(t *testing.T) { require.Equal(t, 2, d.beginCalls) // first Begin failed, second succeeded }) - t.Run("SafeToRetry-flagged Commit failure is retried", func(t *testing.T) { + t.Run("SafeToRetry-flagged Commit failure is verified-aborted and retried", func(t *testing.T) { + // A SafeToRetry-flagged commit error (pre-send) for a write transaction + // is now verified (not fast-pathed): pg_xact_status says aborted (the + // COMMIT never sent, so the tx didn't commit) → retry → second attempt + // succeeds. d := &mockDriver{ - commitErrs: []error{&safeToRetryErr{msg: "pre-send commit failure"}}, + captureXidResult: "1", // write transaction + verifyStatusResult: "aborted", + commitErrs: []error{&safeToRetryErr{msg: "pre-send commit failure"}}, } db := newMockDB(d) defer db.Close() @@ -277,8 +283,8 @@ func TestRetryPostgresNonIdempotent(t *testing.T) { }) require.NoError(t, err) require.Equal(t, 2, calls) - require.Equal(t, 2, d.beginCalls) - require.Equal(t, 2, d.commitCalls) // first commit failed, second succeeded + require.Equal(t, 2, d.commitCalls) + require.Equal(t, 1, d.verifyCalls) // verified once (first attempt's commit) }) t.Run("commit-phase network error with inconclusive verification is wrapped with ErrCommitPhase", func(t *testing.T) { @@ -426,14 +432,13 @@ func TestRetryPostgresNonIdempotent(t *testing.T) { }) t.Run("commit-phase class-40 exhaustion returns the original error (not ErrCommitPhase)", func(t *testing.T) { - // serialization_failure (class 40) at commit is retryable per the commit - // classifier (server rolled back → known non-commit). Retried until - // exhausted; the original error is returned (NOT ErrCommitPhase — it's - // not ambiguous, and verification isn't consulted for classifier- - // retryable commit errors). + // serialization_failure (class 40) at commit: the server rolled back, + // so pg_xact_status says aborted → retry. Retried until exhausted; the + // original error is returned (NOT ErrCommitPhase — it's known + // non-commit, just out of retries). d := &mockDriver{ captureXidResult: "1", - verifyStatusResult: "aborted", // unused — classifier says retryable, verify not called + verifyStatusResult: "aborted", // server rolled back → aborted → retry each time commitErrs: []error{ &pgconn.PgError{Code: pgerrcode.SerializationFailure}, &pgconn.PgError{Code: pgerrcode.SerializationFailure}, @@ -451,7 +456,7 @@ func TestRetryPostgresNonIdempotent(t *testing.T) { require.Error(t, err) require.Equal(t, 3, calls) require.Equal(t, 3, d.commitCalls) - require.Equal(t, 0, d.verifyCalls) // classifier-retryable → no verification + require.Equal(t, 3, d.verifyCalls) // verified each attempt (always-verify) require.NotErrorIs(t, err, ErrCommitPhase) // known non-commit, not ambiguous var pgErr *pgconn.PgError require.ErrorAs(t, err, &pgErr) @@ -522,39 +527,6 @@ func TestRetryPostgresNonIdempotent(t *testing.T) { }) } -func TestIsSafeRetryablePostgresError(t *testing.T) { - // Commit-phase classifier (maximally conservative): only errors that can - // NEVER mean "possibly committed" — pgconn.SafeToRetry (pre-send) and - // class 40 (server rolled back before the ErrorResponse). Everything else - // is excluded → wrapped with ErrCommitPhase. See doc. - - require.True(t, isSafeRetryablePostgresError(&safeToRetryErr{msg: "pre-send commit"})) // SafeToRetry-flagged - - // Class 40 (Transaction Rollback) — server rolled back before returning. - require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: pgerrcode.SerializationFailure})) - require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: pgerrcode.DeadlockDetected})) - require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: pgerrcode.TransactionRollback})) - require.True(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: pgerrcode.TransactionIntegrityConstraintViolation})) - - // Excluded (could mean "possibly committed" → ErrCommitPhase at commit). - require.False(t, isSafeRetryablePostgresError(driver.ErrBadConn)) // not seen at commit (pgx Commit returns native errors) - require.False(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: pgerrcode.AdminShutdown})) // die() can interrupt after commit recorded - require.False(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: pgerrcode.CrashShutdown})) // quickdie() _exit(2) without rollback - require.False(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: pgerrcode.CannotConnectNow})) // connect-time, not a commit error - require.False(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: pgerrcode.TooManyConnections})) - require.False(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: pgerrcode.QueryCanceled})) // cancel-after-commit edge case - - // Permanent + network + context errors not retryable here. - require.False(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: pgerrcode.UniqueViolation})) - require.False(t, isSafeRetryablePostgresError(&pgconn.PgError{Code: pgerrcode.SyntaxError})) - require.False(t, isSafeRetryablePostgresError(nil)) - require.False(t, isSafeRetryablePostgresError(io.EOF)) - require.False(t, isSafeRetryablePostgresError(io.ErrUnexpectedEOF)) - require.False(t, isSafeRetryablePostgresError(&net.OpError{Op: "read", Err: errors.New("connection reset by peer")})) - require.False(t, isSafeRetryablePostgresError(context.Canceled)) - require.False(t, isSafeRetryablePostgresError(errors.New("some application error"))) -} - func TestIsRetryablePostgresPreCommitError(t *testing.T) { // Pre-commit classifier (opt-out): retries everything except context // cancellation and permanent SQLSTATE classes (22xxx/23xxx/42xxx).