From 6d71baf68321bdc679ddf10ae4acc2baa5b869be Mon Sep 17 00:00:00 2001
From: Bo-Yi Wu <appleboy.tw@gmail.com>
Date: Sun, 12 Apr 2026 21:43:26 +0800
Subject: [PATCH 1/4] perf(db): add Phase 1 indexes, batched cleanup, and
 distributed cleanup lock

- Add composite and single-column indexes on oauth_applications, user_authorizations, oauth_connections, audit_logs, device_codes, and authorization_codes for multi-pod hot paths
- Ship PostgreSQL migration using CREATE INDEX CONCURRENTLY to avoid write locks on large tables
- Batch DELETE of expired tokens, device codes, and old audit logs via shared deleteInBatches helper (10k rows per iteration)
- Add Redis-backed distributed cleanup lock via rueidislock so only one pod runs each cleanup per tick
- Introduce .env.production.example with recommended redis-aside cache defaults for 5+ pod deployments

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .env.production.example                       |  71 +++++++
 internal/bootstrap/bootstrap.go               |  13 +-
 internal/bootstrap/cleanup_lock.go            |  80 ++++++++
 internal/bootstrap/server.go                  |  90 ++++++---
 internal/config/config.go                     |   8 +
 internal/models/audit_log.go                  |  18 +-
 internal/models/authorization_code.go         |   4 +-
 internal/models/device_code.go                |   4 +-
 internal/models/oauth_application.go          |   6 +-
 internal/models/oauth_connection.go           |   2 +-
 internal/models/user_authorization.go         |   8 +-
 internal/store/audit_log.go                   |   6 +-
 internal/store/cleanup.go                     |  46 ++++-
 internal/store/cleanup_test.go                | 181 ++++++++++++++++++
 .../migrations/001_add_phase1_indexes.sql     |  43 +++++
 15 files changed, 528 insertions(+), 52 deletions(-)
 create mode 100644 .env.production.example
 create mode 100644 internal/bootstrap/cleanup_lock.go
 create mode 100644 internal/store/cleanup_test.go
 create mode 100644 internal/store/migrations/001_add_phase1_indexes.sql

diff --git a/.env.production.example b/.env.production.example
new file mode 100644
index 00000000..dccb7db2
--- /dev/null
+++ b/.env.production.example
@@ -0,0 +1,71 @@
+# AuthGate production environment template for multi-pod deployments
+# (20k+ users, 5+ replicas, PostgreSQL + Redis).
+#
+# Copy to .env or inject via your secrets manager / Helm values, then override
+# secrets (JWT_SECRET, SESSION_SECRET, DATABASE_DSN, REDIS_ADDR, etc.) to match
+# your infrastructure. Adjust cache/pool sizes only after observing real
+# traffic metrics (cache hit rate, DB CPU, connection count).
+# =============================================================================
+
+ENVIRONMENT=production
+
+# ---- Secrets (REQUIRED: generate fresh values with `openssl rand -hex 32`) ---
+JWT_SECRET=CHANGE_ME
+SESSION_SECRET=CHANGE_ME
+
+# ---- Database ---------------------------------------------------------------
+DATABASE_DRIVER=postgres
+DATABASE_DSN=host=postgres user=authgate password=CHANGE_ME dbname=authgate port=5432 sslmode=require
+
+# Connection pool: 5 pods × 25 conns = 125; ensure PG max_connections >= 200.
+DB_MAX_OPEN_CONNS=25
+DB_MAX_IDLE_CONNS=10
+DB_CONN_MAX_LIFETIME=5m
+DB_CONN_MAX_IDLE_TIME=10m
+
+# ---- Redis (shared cache + rate limit + cleanup lock) -----------------------
+REDIS_ADDR=redis:6379
+# REDIS_PASSWORD=
+# REDIS_DB=0
+
+# ---- Token verification cache (major DB-load reducer) -----------------------
+# Off by default in .env.example; production should enable this.
+TOKEN_CACHE_ENABLED=true
+TOKEN_CACHE_TYPE=redis-aside
+TOKEN_CACHE_TTL=10h
+TOKEN_CACHE_CLIENT_TTL=1h
+# If pod memory is tight, drop this to 16 (MB) per connection.
+TOKEN_CACHE_SIZE_PER_CONN=32
+
+# ---- Client / User / Metrics cache (shared across pods) ---------------------
+CLIENT_CACHE_TYPE=redis-aside
+CLIENT_COUNT_CACHE_TYPE=redis-aside
+USER_CACHE_TYPE=redis-aside
+METRICS_CACHE_TYPE=redis-aside
+
+# ---- Expired token / device code cleanup ------------------------------------
+# All pods may enable this: a Redis-backed distributed lock (below) prevents
+# concurrent runs — only one pod does the DELETE each interval.
+ENABLE_EXPIRED_TOKEN_CLEANUP=true
+EXPIRED_TOKEN_CLEANUP_INTERVAL=30m
+
+# Distributed cleanup lock via rueidislock. Required for multi-pod.
+ENABLE_CLEANUP_LOCK=true
+CLEANUP_LOCK_KEY_VALIDITY=5m
+
+# ---- Audit logging ----------------------------------------------------------
+ENABLE_AUDIT_LOGGING=true
+AUDIT_LOG_RETENTION=2160h   # 90 days
+
+# ---- Rate limiting (distributed) --------------------------------------------
+ENABLE_RATE_LIMIT=true
+RATE_LIMIT_STORE=redis
+
+# ---- Metrics ----------------------------------------------------------------
+METRICS_ENABLED=true
+# Enable on ONE replica only; other pods should set this to false to avoid
+# duplicated gauge updates. Alternatively, use PromQL avg()/max() aggregation.
+METRICS_GAUGE_UPDATE_ENABLED=true
+
+# ---- Sessions ---------------------------------------------------------------
+SESSION_FINGERPRINT=true
diff --git a/internal/bootstrap/bootstrap.go b/internal/bootstrap/bootstrap.go
index 8587cfed..f02505d6 100644
--- a/internal/bootstrap/bootstrap.go
+++ b/internal/bootstrap/bootstrap.go
@@ -15,6 +15,7 @@ import (
 	"github.com/gin-gonic/gin"
 	"github.com/prometheus/client_golang/prometheus"
 	"github.com/redis/go-redis/v9"
+	"github.com/redis/rueidis/rueidislock"
 )
 
 // Application holds all initialized components
@@ -39,6 +40,7 @@ type Application struct {
 	TokenCache             core.Cache[models.AccessToken]
 	TokenCacheCloser       func() error
 	RateLimitRedisClient   *redis.Client
+	CleanupLocker          rueidislock.Locker
 
 	// Services
 	AuditService core.AuditLogger
@@ -137,6 +139,12 @@ func (app *Application) initializeInfrastructure(ctx context.Context) error {
 		return err
 	}
 
+	// Distributed cleanup lock (multi-pod: serialize DELETE jobs)
+	app.CleanupLocker, err = initializeCleanupLocker(app.Config)
+	if err != nil {
+		return err
+	}
+
 	return nil
 }
 
@@ -222,8 +230,9 @@ func (app *Application) startWithGracefulShutdown() {
 	addClientCacheCleanupJob(m, app.ClientCache, app.Config)
 	addTokenCacheCleanupJob(m, app.TokenCache, app.Config)
 	addDatabaseShutdownJob(m, app.DB, app.Config)
-	addAuditLogCleanupJob(m, app.Config, app.AuditService)
-	addExpiredTokenCleanupJob(m, app.DB, app.Config)
+	addAuditLogCleanupJob(m, app.Config, app.AuditService, app.CleanupLocker)
+	addExpiredTokenCleanupJob(m, app.DB, app.Config, app.CleanupLocker)
+	addCleanupLockerShutdownJob(m, app.CleanupLocker)
 	addMetricsGaugeUpdateJob(m, app.Config, app.DB, app.MetricsRecorder, app.MetricsCache)
 
 	// Wait for graceful shutdown
diff --git a/internal/bootstrap/cleanup_lock.go b/internal/bootstrap/cleanup_lock.go
new file mode 100644
index 00000000..97cb1788
--- /dev/null
+++ b/internal/bootstrap/cleanup_lock.go
@@ -0,0 +1,80 @@
+package bootstrap
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"log"
+
+	"github.com/redis/rueidis"
+	"github.com/redis/rueidis/rueidislock"
+
+	"github.com/go-authgate/authgate/internal/config"
+)
+
+// Lock names for the distributed cleanup jobs. Keep in sync with docs/runbooks
+// that may reference these keys for debugging stuck cleanups.
+const (
+	cleanupLockAuditLogs     = "cleanup:audit-logs"
+	cleanupLockExpiredTokens = "cleanup:expired-tokens"
+)
+
+// initializeCleanupLocker builds a Redis-backed distributed locker that
+// serializes periodic cleanup jobs across multi-pod deployments. Returns
+// (nil, nil) when cleanup lock is disabled; callers treat a nil locker as
+// "run unconditionally" (single-instance mode).
+//
+// KeyMajority is 1 (single Redis target) rather than a Redlock quorum. A
+// Redis failover window could allow two pods to hold the lock simultaneously,
+// but cleanup DELETEs are idempotent (the inner SELECT finds no matching rows
+// on the second pod), so this is safe — the worst case is transient double
+// work, never data loss or corruption.
+func initializeCleanupLocker(cfg *config.Config) (rueidislock.Locker, error) {
+	if !cfg.EnableCleanupLock {
+		return nil, nil //nolint:nilnil // locker not needed when feature is disabled
+	}
+	if cfg.RedisAddr == "" {
+		return nil, errors.New("ENABLE_CLEANUP_LOCK requires REDIS_ADDR to be set")
+	}
+
+	locker, err := rueidislock.NewLocker(rueidislock.LockerOption{
+		ClientOption: rueidis.ClientOption{
+			InitAddress: []string{cfg.RedisAddr},
+			Password:    cfg.RedisPassword,
+			SelectDB:    cfg.RedisDB,
+		},
+		KeyPrefix:   "authgate:lock",
+		KeyMajority: 1,
+		KeyValidity: cfg.CleanupLockKeyValidity,
+	})
+	if err != nil {
+		return nil, fmt.Errorf("failed to create cleanup locker: %w", err)
+	}
+
+	log.Printf("Cleanup lock initialized (validity: %v)", cfg.CleanupLockKeyValidity)
+	return locker, nil
+}
+
+// runWithCleanupLock executes fn while holding the named distributed lock.
+// When locker is nil (single-instance mode) fn runs unconditionally.
+// When another pod currently holds the lock, fn is skipped silently and
+// nil is returned — the next tick will try again.
+func runWithCleanupLock(
+	ctx context.Context,
+	locker rueidislock.Locker,
+	name string,
+	fn func(context.Context) error,
+) error {
+	if locker == nil {
+		return fn(ctx)
+	}
+	lockCtx, cancel, err := locker.TryWithContext(ctx, name)
+	if err != nil {
+		if errors.Is(err, rueidislock.ErrNotLocked) {
+			return nil
+		}
+		return fmt.Errorf("acquire cleanup lock %q: %w", name, err)
+	}
+	defer cancel()
+	return fn(lockCtx)
+}
diff --git a/internal/bootstrap/server.go b/internal/bootstrap/server.go
index c9962082..282c3eb2 100644
--- a/internal/bootstrap/server.go
+++ b/internal/bootstrap/server.go
@@ -14,6 +14,7 @@ import (
 
 	"github.com/appleboy/graceful"
 	"github.com/redis/go-redis/v9"
+	"github.com/redis/rueidis/rueidislock"
 )
 
 // createHTTPServer creates the HTTP server instance
@@ -118,36 +119,46 @@ func addAuditServiceShutdownJob(
 	})
 }
 
-// addAuditLogCleanupJob adds periodic audit log cleanup job
+// addAuditLogCleanupJob adds periodic audit log cleanup job.
+// When a cleanup locker is supplied, only one pod across the fleet performs
+// the DELETE per tick; others skip silently.
 func addAuditLogCleanupJob(
 	m *graceful.Manager,
 	cfg *config.Config,
 	auditService core.AuditLogger,
+	locker rueidislock.Locker,
 ) {
 	if !cfg.EnableAuditLogging || cfg.AuditLogRetention <= 0 {
 		return
 	}
 
+	run := func(ctx context.Context) error {
+		return runWithCleanupLock(ctx, locker, cleanupLockAuditLogs, func(context.Context) error {
+			deleted, err := auditService.CleanupOldLogs(cfg.AuditLogRetention)
+			if err != nil {
+				log.Printf("Failed to cleanup old audit logs: %v", err)
+				return nil
+			}
+			if deleted > 0 {
+				log.Printf("Cleaned up %d old audit logs", deleted)
+			}
+			return nil
+		})
+	}
+
 	m.AddRunningJob(func(ctx context.Context) error {
 		ticker := time.NewTicker(24 * time.Hour)
 		defer ticker.Stop()
 
-		// Run cleanup immediately on startup
-		if deleted, err := auditService.CleanupOldLogs(cfg.AuditLogRetention); err != nil {
-			log.Printf("Failed to cleanup old audit logs: %v", err)
-		} else if deleted > 0 {
-			log.Printf("Cleaned up %d old audit logs", deleted)
+		if err := run(ctx); err != nil {
+			log.Printf("Audit log cleanup run error: %v", err)
 		}
 
 		for {
 			select {
 			case <-ticker.C:
-				if deleted, err := auditService.CleanupOldLogs(
-					cfg.AuditLogRetention,
-				); err != nil {
-					log.Printf("Failed to cleanup old audit logs: %v", err)
-				} else if deleted > 0 {
-					log.Printf("Cleaned up %d old audit logs", deleted)
+				if err := run(ctx); err != nil {
+					log.Printf("Audit log cleanup run error: %v", err)
 				}
 			case <-ctx.Done():
 				return nil
@@ -290,31 +301,48 @@ func addTokenCacheCleanupJob(
 
 // addExpiredTokenCleanupJob adds a periodic job that purges expired access tokens
 // and device codes from the database to prevent unbounded table growth.
-func addExpiredTokenCleanupJob(m *graceful.Manager, db *store.Store, cfg *config.Config) {
+// When a cleanup locker is supplied, only one pod across the fleet performs
+// the DELETE per tick; others skip silently.
+func addExpiredTokenCleanupJob(
+	m *graceful.Manager,
+	db *store.Store,
+	cfg *config.Config,
+	locker rueidislock.Locker,
+) {
 	if !cfg.EnableExpiredTokenCleanup {
 		return
 	}
 
+	run := func(ctx context.Context) error {
+		return runWithCleanupLock(
+			ctx,
+			locker,
+			cleanupLockExpiredTokens,
+			func(context.Context) error {
+				if err := db.DeleteExpiredTokens(); err != nil {
+					log.Printf("Failed to cleanup expired tokens: %v", err)
+				}
+				if err := db.DeleteExpiredDeviceCodes(); err != nil {
+					log.Printf("Failed to cleanup expired device codes: %v", err)
+				}
+				return nil
+			},
+		)
+	}
+
 	m.AddRunningJob(func(ctx context.Context) error {
 		ticker := time.NewTicker(cfg.ExpiredTokenCleanupInterval)
 		defer ticker.Stop()
 
-		// Run cleanup immediately on startup
-		if err := db.DeleteExpiredTokens(); err != nil {
-			log.Printf("Failed to cleanup expired tokens: %v", err)
-		}
-		if err := db.DeleteExpiredDeviceCodes(); err != nil {
-			log.Printf("Failed to cleanup expired device codes: %v", err)
+		if err := run(ctx); err != nil {
+			log.Printf("Expired token cleanup run error: %v", err)
 		}
 
 		for {
 			select {
 			case <-ticker.C:
-				if err := db.DeleteExpiredTokens(); err != nil {
-					log.Printf("Failed to cleanup expired tokens: %v", err)
-				}
-				if err := db.DeleteExpiredDeviceCodes(); err != nil {
-					log.Printf("Failed to cleanup expired device codes: %v", err)
+				if err := run(ctx); err != nil {
+					log.Printf("Expired token cleanup run error: %v", err)
 				}
 			case <-ctx.Done():
 				return nil
@@ -323,6 +351,20 @@ func addExpiredTokenCleanupJob(m *graceful.Manager, db *store.Store, cfg *config
 	})
 }
 
+// addCleanupLockerShutdownJob closes the distributed cleanup locker on
+// shutdown. No-op when locker is nil.
+func addCleanupLockerShutdownJob(m *graceful.Manager, locker rueidislock.Locker) {
+	if locker == nil {
+		return
+	}
+	m.AddShutdownJob(func() error {
+		log.Println("Closing cleanup locker...")
+		locker.Close()
+		log.Println("Cleanup locker closed")
+		return nil
+	})
+}
+
 // addDatabaseShutdownJob adds database connection close handler
 func addDatabaseShutdownJob(m *graceful.Manager, db *store.Store, cfg *config.Config) {
 	m.AddShutdownJob(func() error {
diff --git a/internal/config/config.go b/internal/config/config.go
index d3a9a3ce..2a8ff395 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -164,6 +164,10 @@ type Config struct {
 	EnableExpiredTokenCleanup   bool          // Enable periodic cleanup of expired tokens and device codes (default: false)
 	ExpiredTokenCleanupInterval time.Duration // How often to purge expired rows (default: 1h)
 
+	// Distributed cleanup lock (multi-pod deployments)
+	EnableCleanupLock      bool          // Use Redis lock so only one pod runs cleanup per interval (default: false)
+	CleanupLockKeyValidity time.Duration // Lock validity window, auto-extended by rueidislock (default: 5m)
+
 	// Prometheus Metrics settings
 	MetricsEnabled             bool          // Enable Prometheus metrics endpoint (default: false)
 	MetricsToken               string        // Bearer token for /metrics (empty = no auth, recommended for production)
@@ -371,6 +375,10 @@ func Load() *Config {
 		EnableExpiredTokenCleanup:   getEnvBool("ENABLE_EXPIRED_TOKEN_CLEANUP", false),
 		ExpiredTokenCleanupInterval: getEnvDuration("EXPIRED_TOKEN_CLEANUP_INTERVAL", time.Hour),
 
+		// Distributed cleanup lock
+		EnableCleanupLock:      getEnvBool("ENABLE_CLEANUP_LOCK", false),
+		CleanupLockKeyValidity: getEnvDuration("CLEANUP_LOCK_KEY_VALIDITY", 5*time.Minute),
+
 		// Prometheus Metrics settings
 		MetricsEnabled:             getEnvBool("METRICS_ENABLED", false),
 		MetricsToken:               getEnv("METRICS_TOKEN", ""),
diff --git a/internal/models/audit_log.go b/internal/models/audit_log.go
index 551abd84..662a1551 100644
--- a/internal/models/audit_log.go
+++ b/internal/models/audit_log.go
@@ -137,19 +137,19 @@ type AuditLog struct {
 	ID string `gorm:"primaryKey;type:varchar(36)" json:"id"`
 
 	// Event information
-	EventType EventType     `gorm:"type:varchar(50);index;not null" json:"event_type"`
-	EventTime time.Time     `gorm:"index;not null"                  json:"event_time"`
-	Severity  EventSeverity `gorm:"type:varchar(20);not null"       json:"severity"`
+	EventType EventType     `gorm:"type:varchar(50);index;not null;index:idx_audit_type_time,priority:1"                                                                                             json:"event_type"`
+	EventTime time.Time     `gorm:"index;not null;index:idx_audit_type_time,priority:2,sort:desc;index:idx_audit_actor_time,priority:2,sort:desc;index:idx_audit_resource_time,priority:3,sort:desc" json:"event_time"`
+	Severity  EventSeverity `gorm:"type:varchar(20);not null;index"                                                                                                                                  json:"severity"`
 
 	// Actor information
-	ActorUserID   string `gorm:"type:varchar(36);index" json:"actor_user_id"`
-	ActorUsername string `gorm:"type:varchar(100)"      json:"actor_username"`
-	ActorIP       string `gorm:"type:varchar(45);index" json:"actor_ip"` // Support IPv6
+	ActorUserID   string `gorm:"type:varchar(36);index;index:idx_audit_actor_time,priority:1" json:"actor_user_id"`
+	ActorUsername string `gorm:"type:varchar(100)"                                            json:"actor_username"`
+	ActorIP       string `gorm:"type:varchar(45);index"                                       json:"actor_ip"` // Support IPv6
 
 	// Resource information
-	ResourceType ResourceType `gorm:"type:varchar(50);index" json:"resource_type"`
-	ResourceID   string       `gorm:"type:varchar(36);index" json:"resource_id"`
-	ResourceName string       `gorm:"type:varchar(255)"      json:"resource_name"`
+	ResourceType ResourceType `gorm:"type:varchar(50);index;index:idx_audit_resource_time,priority:1" json:"resource_type"`
+	ResourceID   string       `gorm:"type:varchar(36);index;index:idx_audit_resource_time,priority:2" json:"resource_id"`
+	ResourceName string       `gorm:"type:varchar(255)"                                               json:"resource_name"`
 
 	// Operation details
 	Action       string       `gorm:"type:varchar(255);not null" json:"action"`
diff --git a/internal/models/authorization_code.go b/internal/models/authorization_code.go
index 8acbf895..83222565 100644
--- a/internal/models/authorization_code.go
+++ b/internal/models/authorization_code.go
@@ -27,8 +27,8 @@ type AuthorizationCode struct {
 	// OIDC (OpenID Connect Core 1.0 §3.1.2.1)
 	Nonce string `gorm:"default:''"` // nonce from authorization request (empty if not provided)
 
-	ExpiresAt time.Time  `gorm:"index"`
-	UsedAt    *time.Time // Set immediately upon exchange; prevents replay attacks
+	ExpiresAt time.Time  `gorm:"index;index:idx_authcode_used_exp,priority:2"`
+	UsedAt    *time.Time `gorm:"index:idx_authcode_used_exp,priority:1"` // Set immediately upon exchange; prevents replay attacks
 	CreatedAt time.Time
 }
 
diff --git a/internal/models/device_code.go b/internal/models/device_code.go
index 6abdaa0e..bc03cbd7 100644
--- a/internal/models/device_code.go
+++ b/internal/models/device_code.go
@@ -13,10 +13,10 @@ type DeviceCode struct {
 	UserCode       string    `gorm:"uniqueIndex;not null"`
 	ClientID       string    `gorm:"not null;index"`
 	Scopes         string    `gorm:"not null"` // space-separated scopes
-	ExpiresAt      time.Time `gorm:"index"`
+	ExpiresAt      time.Time `gorm:"index;index:idx_device_auth_exp,priority:2"`
 	Interval       int       // polling interval in seconds
 	UserID         string    // filled after authorization
-	Authorized     bool      `gorm:"default:false"`
+	Authorized     bool      `gorm:"default:false;index:idx_device_auth_exp,priority:1"`
 	AuthorizedAt   time.Time
 	CreatedAt      time.Time
 	UpdatedAt      time.Time
diff --git a/internal/models/oauth_application.go b/internal/models/oauth_application.go
index 8fe215ae..a86b709a 100644
--- a/internal/models/oauth_application.go
+++ b/internal/models/oauth_application.go
@@ -35,15 +35,15 @@ type OAuthApplication struct {
 	ClientSecret                string      `gorm:"not null"` // bcrypt hashed secret
 	ClientName                  string      `gorm:"not null"`
 	Description                 string      `gorm:"type:text"`
-	UserID                      string      `gorm:"not null"`
+	UserID                      string      `gorm:"not null;index"`
 	Scopes                      string      `gorm:"not null"`
 	GrantTypes                  string      `gorm:"not null;default:'device_code'"`
 	RedirectURIs                StringArray `gorm:"type:json"`
 	ClientType                  string      `gorm:"not null;default:'public'"` // "confidential" or "public"
 	EnableDeviceFlow            bool        `gorm:"not null;default:true"`
 	EnableAuthCodeFlow          bool        `gorm:"not null;default:false"`
-	EnableClientCredentialsFlow bool        `gorm:"not null;default:false"`    // Client Credentials Grant (RFC 6749 §4.4); confidential clients only
-	Status                      string      `gorm:"not null;default:'active'"` // ClientStatusPending / ClientStatusActive / ClientStatusInactive
+	EnableClientCredentialsFlow bool        `gorm:"not null;default:false"`          // Client Credentials Grant (RFC 6749 §4.4); confidential clients only
+	Status                      string      `gorm:"not null;default:'active';index"` // ClientStatusPending / ClientStatusActive / ClientStatusInactive
 	CreatedBy                   string
 	CreatedAt                   time.Time
 	UpdatedAt                   time.Time
diff --git a/internal/models/oauth_connection.go b/internal/models/oauth_connection.go
index 010edba6..f8742410 100644
--- a/internal/models/oauth_connection.go
+++ b/internal/models/oauth_connection.go
@@ -7,7 +7,7 @@ import (
 // OAuthConnection represents an OAuth provider connection for a user
 type OAuthConnection struct {
 	ID             string `gorm:"primaryKey"`
-	UserID         string `gorm:"not null;uniqueIndex:idx_oauth_user_provider,priority:1"`
+	UserID         string `gorm:"not null;index;uniqueIndex:idx_oauth_user_provider,priority:1"`
 	Provider       string `gorm:"not null;uniqueIndex:idx_oauth_provider_user,priority:1;uniqueIndex:idx_oauth_user_provider,priority:2"` // "github", "gitea", "gitlab"
 	ProviderUserID string `gorm:"not null;uniqueIndex:idx_oauth_provider_user,priority:2"`                                                // Provider's user ID
 
diff --git a/internal/models/user_authorization.go b/internal/models/user_authorization.go
index 1e23b8a2..3ab0953c 100644
--- a/internal/models/user_authorization.go
+++ b/internal/models/user_authorization.go
@@ -9,14 +9,14 @@ type UserAuthorization struct {
 	UUID string `gorm:"uniqueIndex;size:36;not null"` // Public UUID for API/UI identification
 
 	// Relations (composite unique index ensures one grant per user+app)
-	UserID        string `gorm:"not null;uniqueIndex:idx_user_app"` // FK → User.ID
-	ApplicationID int64  `gorm:"not null;uniqueIndex:idx_user_app"` // FK → OAuthApplication.ID
-	ClientID      string `gorm:"not null;index"`                    // Denormalized ClientID UUID (for UI display and API responses)
+	UserID        string `gorm:"not null;uniqueIndex:idx_user_app;index:idx_user_active,priority:1"` // FK → User.ID
+	ApplicationID int64  `gorm:"not null;uniqueIndex:idx_user_app"`                                  // FK → OAuthApplication.ID
+	ClientID      string `gorm:"not null;index;index:idx_client_active,priority:1"`                  // Denormalized ClientID UUID (for UI display and API responses)
 
 	Scopes    string `gorm:"not null"`
 	GrantedAt time.Time
 	RevokedAt *time.Time
-	IsActive  bool `gorm:"not null;default:true"`
+	IsActive  bool `gorm:"not null;default:true;index:idx_user_active,priority:2;index:idx_client_active,priority:2"`
 
 	CreatedAt time.Time
 	UpdatedAt time.Time
diff --git a/internal/store/audit_log.go b/internal/store/audit_log.go
index 3a6a1f6e..c2161d60 100644
--- a/internal/store/audit_log.go
+++ b/internal/store/audit_log.go
@@ -89,10 +89,10 @@ func (s *Store) GetAuditLogsPaginated(
 	return logs, pagination, nil
 }
 
-// DeleteOldAuditLogs deletes audit logs older than the specified time
+// DeleteOldAuditLogs deletes audit logs older than the specified time in
+// bounded batches to keep lock duration short on large tables.
 func (s *Store) DeleteOldAuditLogs(olderThan time.Time) (int64, error) {
-	result := s.db.Where("created_at < ?", olderThan).Delete(&models.AuditLog{})
-	return result.RowsAffected, result.Error
+	return s.deleteInBatches(&models.AuditLog{}, "created_at < ?", olderThan)
 }
 
 // GetAuditLogStats returns statistics about audit logs in a given time range
diff --git a/internal/store/cleanup.go b/internal/store/cleanup.go
index 01b13059..ff0eea40 100644
--- a/internal/store/cleanup.go
+++ b/internal/store/cleanup.go
@@ -8,12 +8,54 @@ import (
 
 // Cleanup and Metrics operations (implements core.CleanupStore + core.MetricsStore)
 
+// cleanupBatchSize bounds each DELETE to keep lock duration short on large tables.
+// Chosen empirically: 10k rows × a single indexed DELETE completes in well under
+// a second even on a busy PostgreSQL, so replicas / WAL apply do not fall behind.
+// Declared as var (not const) so tests can shrink it to exercise the batching loop.
+var cleanupBatchSize = 10000
+
+// cleanupBatchPause gives WAL / replication / autovacuum breathing room between
+// batches. A short sleep is sufficient; we are optimizing for background-job
+// friendliness, not throughput. Declared as var so tests can zero it out.
+var cleanupBatchPause = 200 * time.Millisecond
+
+// deleteInBatches DELETEs rows of `model` matching whereClause in batches of
+// cleanupBatchSize. The subquery form (`id IN (SELECT id … LIMIT N)`) works on
+// PostgreSQL — which does not support `DELETE … LIMIT` directly — and lets the
+// inner SELECT use the WHERE-clause index (e.g. expires_at).
+func (s *Store) deleteInBatches(
+	model any,
+	whereClause string,
+	args ...any,
+) (int64, error) {
+	var total int64
+	for {
+		sub := s.db.Model(model).
+			Select("id").
+			Where(whereClause, args...).
+			Limit(cleanupBatchSize)
+		res := s.db.Where("id IN (?)", sub).Delete(model)
+		if res.Error != nil {
+			return total, res.Error
+		}
+		total += res.RowsAffected
+		if res.RowsAffected < int64(cleanupBatchSize) {
+			return total, nil
+		}
+		time.Sleep(cleanupBatchPause)
+	}
+}
+
+// DeleteExpiredTokens removes access/refresh tokens past expiry.
 func (s *Store) DeleteExpiredTokens() error {
-	return s.db.Where("expires_at < ?", time.Now()).Delete(&models.AccessToken{}).Error
+	_, err := s.deleteInBatches(&models.AccessToken{}, "expires_at < ?", time.Now())
+	return err
 }
 
+// DeleteExpiredDeviceCodes removes device authorization codes past expiry.
 func (s *Store) DeleteExpiredDeviceCodes() error {
-	return s.db.Where("expires_at < ?", time.Now()).Delete(&models.DeviceCode{}).Error
+	_, err := s.deleteInBatches(&models.DeviceCode{}, "expires_at < ?", time.Now())
+	return err
 }
 
 // CountActiveTokensByCategory counts active, non-expired tokens by category
diff --git a/internal/store/cleanup_test.go b/internal/store/cleanup_test.go
new file mode 100644
index 00000000..ce68402a
--- /dev/null
+++ b/internal/store/cleanup_test.go
@@ -0,0 +1,181 @@
+package store
+
+import (
+	"testing"
+	"time"
+
+	"github.com/go-authgate/authgate/internal/models"
+
+	"github.com/google/uuid"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// withSmallCleanupBatch shrinks the cleanup batch loop parameters for the
+// duration of a test so we can exercise the multi-batch path with a realistic
+// row count. Restores the production defaults on cleanup.
+func withSmallCleanupBatch(t *testing.T, size int) {
+	t.Helper()
+	origSize := cleanupBatchSize
+	origPause := cleanupBatchPause
+	cleanupBatchSize = size
+	cleanupBatchPause = 0
+	t.Cleanup(func() {
+		cleanupBatchSize = origSize
+		cleanupBatchPause = origPause
+	})
+}
+
+// seedExpiredTokens creates `count` AccessToken rows already past expiry.
+func seedExpiredTokens(t *testing.T, s *Store, count int) {
+	t.Helper()
+	expired := time.Now().Add(-time.Hour)
+	tokens := make([]*models.AccessToken, 0, count)
+	for range count {
+		tokens = append(tokens, &models.AccessToken{
+			ID:            uuid.New().String(),
+			TokenHash:     uuid.New().String(),
+			TokenCategory: models.TokenCategoryAccess,
+			Status:        models.TokenStatusActive,
+			UserID:        "user-" + uuid.New().String(),
+			ClientID:      "client-x",
+			Scopes:        "read",
+			ExpiresAt:     expired,
+		})
+	}
+	require.NoError(t, s.db.CreateInBatches(tokens, 100).Error)
+}
+
+// seedActiveTokens creates `count` AccessToken rows still valid for a while.
+func seedActiveTokens(t *testing.T, s *Store, count int) {
+	t.Helper()
+	future := time.Now().Add(time.Hour)
+	tokens := make([]*models.AccessToken, 0, count)
+	for range count {
+		tokens = append(tokens, &models.AccessToken{
+			ID:            uuid.New().String(),
+			TokenHash:     uuid.New().String(),
+			TokenCategory: models.TokenCategoryAccess,
+			Status:        models.TokenStatusActive,
+			UserID:        "user-" + uuid.New().String(),
+			ClientID:      "client-x",
+			Scopes:        "read",
+			ExpiresAt:     future,
+		})
+	}
+	require.NoError(t, s.db.CreateInBatches(tokens, 100).Error)
+}
+
+func TestDeleteExpiredTokensBatched(t *testing.T) {
+	withSmallCleanupBatch(t, 100) // force the loop to iterate
+	s := createFreshStore(t, "sqlite", nil)
+
+	seedExpiredTokens(t, s, 250)
+	seedActiveTokens(t, s, 30)
+
+	require.NoError(t, s.DeleteExpiredTokens())
+
+	var remaining int64
+	require.NoError(t, s.db.Model(&models.AccessToken{}).Count(&remaining).Error)
+	assert.Equal(t, int64(30), remaining, "only unexpired tokens should remain")
+}
+
+func TestDeleteExpiredTokensNoRows(t *testing.T) {
+	withSmallCleanupBatch(t, 100)
+	s := createFreshStore(t, "sqlite", nil)
+
+	// Nothing expired; cleanup must still complete without error and leave
+	// everything intact.
+	seedActiveTokens(t, s, 10)
+
+	require.NoError(t, s.DeleteExpiredTokens())
+
+	var remaining int64
+	require.NoError(t, s.db.Model(&models.AccessToken{}).Count(&remaining).Error)
+	assert.Equal(t, int64(10), remaining)
+}
+
+func TestDeleteExpiredDeviceCodesBatched(t *testing.T) {
+	withSmallCleanupBatch(t, 50)
+	s := createFreshStore(t, "sqlite", nil)
+
+	expired := time.Now().Add(-time.Hour)
+	future := time.Now().Add(time.Hour)
+
+	expiredCodes := make([]*models.DeviceCode, 0, 120)
+	for range 120 {
+		expiredCodes = append(expiredCodes, &models.DeviceCode{
+			DeviceCodeHash: uuid.New().String(),
+			DeviceCodeSalt: "salt",
+			DeviceCodeID:   uuid.New().String()[:8],
+			UserCode:       uuid.New().String()[:8],
+			ClientID:       "client-x",
+			Scopes:         "read",
+			ExpiresAt:      expired,
+		})
+	}
+	require.NoError(t, s.db.CreateInBatches(expiredCodes, 40).Error)
+
+	activeCodes := make([]*models.DeviceCode, 0, 15)
+	for range 15 {
+		activeCodes = append(activeCodes, &models.DeviceCode{
+			DeviceCodeHash: uuid.New().String(),
+			DeviceCodeSalt: "salt",
+			DeviceCodeID:   uuid.New().String()[:8],
+			UserCode:       uuid.New().String()[:8],
+			ClientID:       "client-x",
+			Scopes:         "read",
+			ExpiresAt:      future,
+		})
+	}
+	require.NoError(t, s.db.CreateInBatches(activeCodes, 40).Error)
+
+	require.NoError(t, s.DeleteExpiredDeviceCodes())
+
+	var remaining int64
+	require.NoError(t, s.db.Model(&models.DeviceCode{}).Count(&remaining).Error)
+	assert.Equal(t, int64(15), remaining)
+}
+
+func TestDeleteOldAuditLogsBatched(t *testing.T) {
+	withSmallCleanupBatch(t, 50)
+	s := createFreshStore(t, "sqlite", nil)
+
+	// 80 old entries and 20 recent entries; retention cutoff is "now minus 1h"
+	oldTime := time.Now().Add(-24 * time.Hour)
+	recent := time.Now()
+
+	oldLogs := make([]*models.AuditLog, 0, 80)
+	for range 80 {
+		oldLogs = append(oldLogs, &models.AuditLog{
+			ID:        uuid.New().String(),
+			EventType: models.EventAuthenticationSuccess,
+			EventTime: oldTime,
+			Severity:  models.SeverityInfo,
+			Action:    "test",
+			CreatedAt: oldTime,
+		})
+	}
+	require.NoError(t, s.db.CreateInBatches(oldLogs, 40).Error)
+
+	newLogs := make([]*models.AuditLog, 0, 20)
+	for range 20 {
+		newLogs = append(newLogs, &models.AuditLog{
+			ID:        uuid.New().String(),
+			EventType: models.EventAuthenticationSuccess,
+			EventTime: recent,
+			Severity:  models.SeverityInfo,
+			Action:    "test",
+			CreatedAt: recent,
+		})
+	}
+	require.NoError(t, s.db.CreateInBatches(newLogs, 40).Error)
+
+	deleted, err := s.DeleteOldAuditLogs(time.Now().Add(-time.Hour))
+	require.NoError(t, err)
+	assert.Equal(t, int64(80), deleted)
+
+	var remaining int64
+	require.NoError(t, s.db.Model(&models.AuditLog{}).Count(&remaining).Error)
+	assert.Equal(t, int64(20), remaining)
+}
diff --git a/internal/store/migrations/001_add_phase1_indexes.sql b/internal/store/migrations/001_add_phase1_indexes.sql
new file mode 100644
index 00000000..8c2ff3f1
--- /dev/null
+++ b/internal/store/migrations/001_add_phase1_indexes.sql
@@ -0,0 +1,43 @@
+-- Phase 1 index additions for AuthGate performance at 20k+ user scale.
+--
+-- Usage notes
+--   * PostgreSQL only. Run each statement individually; CREATE INDEX
+--     CONCURRENTLY cannot run inside a transaction block.
+--   * Safe to re-run: `IF NOT EXISTS` short-circuits when indexes already exist.
+--   * AutoMigrate in internal/store/sqlite.go will create equivalent indexes on
+--     fresh databases via GORM tags; this file is only needed for existing
+--     production databases where CONCURRENTLY is required to avoid write locks.
+
+-- oauth_applications: owner lookups and status filtering
+CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_oauth_app_user_id
+    ON oauth_applications (user_id);
+CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_oauth_app_status
+    ON oauth_applications (status);
+
+-- user_authorizations: ListUserAuthorizations / RevokeAllUserAuthorizationsByClientID
+CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_user_active
+    ON user_authorizations (user_id, is_active);
+CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_client_active
+    ON user_authorizations (client_id, is_active);
+
+-- oauth_connections: GetOAuthConnectionsByUserID
+CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_oauth_conn_user_id
+    ON oauth_connections (user_id);
+
+-- audit_logs: admin filter combinations
+CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_audit_severity
+    ON audit_logs (severity);
+CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_audit_type_time
+    ON audit_logs (event_type, event_time DESC);
+CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_audit_actor_time
+    ON audit_logs (actor_user_id, event_time DESC);
+CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_audit_resource_time
+    ON audit_logs (resource_type, resource_id, event_time DESC);
+
+-- device_codes: CountPendingDeviceCodes / pending polling lookups
+CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_device_auth_exp
+    ON device_codes (authorized, expires_at);
+
+-- authorization_codes: cleanup and replay-prevention queries
+CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_authcode_used_exp
+    ON authorization_codes (used_at, expires_at);

From 3c74ea3c761d7395754cccad19b7b815e3fc1775 Mon Sep 17 00:00:00 2001
From: Bo-Yi Wu <appleboy.tw@gmail.com>
Date: Sun, 12 Apr 2026 22:12:36 +0800
Subject: [PATCH 2/4] fix(db): align gorm index names with migration SQL and
 honor audit cleanup interval

- Rename GORM index tags on oauth_applications.user_id / status, oauth_connections.user_id, and audit_logs.severity to match the names used in 001_add_phase1_indexes.sql so AutoMigrate does not create duplicate default-named indexes on existing PostgreSQL deployments
- Use cfg.AuditLogCleanupInterval instead of a hard-coded 24h ticker in addAuditLogCleanupJob and treat <= 0 as disabling cleanup

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 internal/bootstrap/server.go         | 4 ++--
 internal/models/audit_log.go         | 2 +-
 internal/models/oauth_application.go | 6 +++---
 internal/models/oauth_connection.go  | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/internal/bootstrap/server.go b/internal/bootstrap/server.go
index 282c3eb2..333ff7cc 100644
--- a/internal/bootstrap/server.go
+++ b/internal/bootstrap/server.go
@@ -128,7 +128,7 @@ func addAuditLogCleanupJob(
 	auditService core.AuditLogger,
 	locker rueidislock.Locker,
 ) {
-	if !cfg.EnableAuditLogging || cfg.AuditLogRetention <= 0 {
+	if !cfg.EnableAuditLogging || cfg.AuditLogRetention <= 0 || cfg.AuditLogCleanupInterval <= 0 {
 		return
 	}
 
@@ -147,7 +147,7 @@ func addAuditLogCleanupJob(
 	}
 
 	m.AddRunningJob(func(ctx context.Context) error {
-		ticker := time.NewTicker(24 * time.Hour)
+		ticker := time.NewTicker(cfg.AuditLogCleanupInterval)
 		defer ticker.Stop()
 
 		if err := run(ctx); err != nil {
diff --git a/internal/models/audit_log.go b/internal/models/audit_log.go
index 662a1551..a8fc1f1d 100644
--- a/internal/models/audit_log.go
+++ b/internal/models/audit_log.go
@@ -139,7 +139,7 @@ type AuditLog struct {
 	// Event information
 	EventType EventType     `gorm:"type:varchar(50);index;not null;index:idx_audit_type_time,priority:1"                                                                                             json:"event_type"`
 	EventTime time.Time     `gorm:"index;not null;index:idx_audit_type_time,priority:2,sort:desc;index:idx_audit_actor_time,priority:2,sort:desc;index:idx_audit_resource_time,priority:3,sort:desc" json:"event_time"`
-	Severity  EventSeverity `gorm:"type:varchar(20);not null;index"                                                                                                                                  json:"severity"`
+	Severity  EventSeverity `gorm:"type:varchar(20);not null;index:idx_audit_severity"                                                                                                               json:"severity"`
 
 	// Actor information
 	ActorUserID   string `gorm:"type:varchar(36);index;index:idx_audit_actor_time,priority:1" json:"actor_user_id"`
diff --git a/internal/models/oauth_application.go b/internal/models/oauth_application.go
index a86b709a..67897eaa 100644
--- a/internal/models/oauth_application.go
+++ b/internal/models/oauth_application.go
@@ -35,15 +35,15 @@ type OAuthApplication struct {
 	ClientSecret                string      `gorm:"not null"` // bcrypt hashed secret
 	ClientName                  string      `gorm:"not null"`
 	Description                 string      `gorm:"type:text"`
-	UserID                      string      `gorm:"not null;index"`
+	UserID                      string      `gorm:"not null;index:idx_oauth_app_user_id"`
 	Scopes                      string      `gorm:"not null"`
 	GrantTypes                  string      `gorm:"not null;default:'device_code'"`
 	RedirectURIs                StringArray `gorm:"type:json"`
 	ClientType                  string      `gorm:"not null;default:'public'"` // "confidential" or "public"
 	EnableDeviceFlow            bool        `gorm:"not null;default:true"`
 	EnableAuthCodeFlow          bool        `gorm:"not null;default:false"`
-	EnableClientCredentialsFlow bool        `gorm:"not null;default:false"`          // Client Credentials Grant (RFC 6749 §4.4); confidential clients only
-	Status                      string      `gorm:"not null;default:'active';index"` // ClientStatusPending / ClientStatusActive / ClientStatusInactive
+	EnableClientCredentialsFlow bool        `gorm:"not null;default:false"`                               // Client Credentials Grant (RFC 6749 §4.4); confidential clients only
+	Status                      string      `gorm:"not null;default:'active';index:idx_oauth_app_status"` // ClientStatusPending / ClientStatusActive / ClientStatusInactive
 	CreatedBy                   string
 	CreatedAt                   time.Time
 	UpdatedAt                   time.Time
diff --git a/internal/models/oauth_connection.go b/internal/models/oauth_connection.go
index f8742410..fd7f5756 100644
--- a/internal/models/oauth_connection.go
+++ b/internal/models/oauth_connection.go
@@ -7,7 +7,7 @@ import (
 // OAuthConnection represents an OAuth provider connection for a user
 type OAuthConnection struct {
 	ID             string `gorm:"primaryKey"`
-	UserID         string `gorm:"not null;index;uniqueIndex:idx_oauth_user_provider,priority:1"`
+	UserID         string `gorm:"not null;index:idx_oauth_conn_user_id;uniqueIndex:idx_oauth_user_provider,priority:1"`
 	Provider       string `gorm:"not null;uniqueIndex:idx_oauth_provider_user,priority:1;uniqueIndex:idx_oauth_user_provider,priority:2"` // "github", "gitea", "gitlab"
 	ProviderUserID string `gorm:"not null;uniqueIndex:idx_oauth_provider_user,priority:2"`                                                // Provider's user ID
 

From d8fe3219b1428c6a1d5b483eeebeefee1dcce2a1 Mon Sep 17 00:00:00 2001
From: Bo-Yi Wu <appleboy.tw@gmail.com>
Date: Sun, 12 Apr 2026 22:20:55 +0800
Subject: [PATCH 3/4] fix(db): guard cleanup batch helper against
 misconfiguration and document id assumption

- Rename deleteInBatches to deleteByIDInBatches and document that it requires an id primary key column
- Return an error when cleanupBatchSize <= 0 so a misconfigured var cannot silently loop forever via GORM Limit(0) disabling the clause
- Also terminate on RowsAffected == 0 as a belt-and-braces exit condition
- Add a unit test covering the invalid batch size guard

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 internal/store/audit_log.go    |  2 +-
 internal/store/cleanup.go      | 22 +++++++++++++++-------
 internal/store/cleanup_test.go | 17 +++++++++++++++++
 3 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/internal/store/audit_log.go b/internal/store/audit_log.go
index c2161d60..8f54d9ee 100644
--- a/internal/store/audit_log.go
+++ b/internal/store/audit_log.go
@@ -92,7 +92,7 @@ func (s *Store) GetAuditLogsPaginated(
 // DeleteOldAuditLogs deletes audit logs older than the specified time in
 // bounded batches to keep lock duration short on large tables.
 func (s *Store) DeleteOldAuditLogs(olderThan time.Time) (int64, error) {
-	return s.deleteInBatches(&models.AuditLog{}, "created_at < ?", olderThan)
+	return s.deleteByIDInBatches(&models.AuditLog{}, "created_at < ?", olderThan)
 }
 
 // GetAuditLogStats returns statistics about audit logs in a given time range
diff --git a/internal/store/cleanup.go b/internal/store/cleanup.go
index ff0eea40..bdda49f2 100644
--- a/internal/store/cleanup.go
+++ b/internal/store/cleanup.go
@@ -1,6 +1,7 @@
 package store
 
 import (
+	"fmt"
 	"time"
 
 	"github.com/go-authgate/authgate/internal/models"
@@ -19,15 +20,22 @@ var cleanupBatchSize = 10000
 // friendliness, not throughput. Declared as var so tests can zero it out.
 var cleanupBatchPause = 200 * time.Millisecond
 
-// deleteInBatches DELETEs rows of `model` matching whereClause in batches of
-// cleanupBatchSize. The subquery form (`id IN (SELECT id … LIMIT N)`) works on
-// PostgreSQL — which does not support `DELETE … LIMIT` directly — and lets the
+// deleteByIDInBatches DELETEs rows of `model` matching whereClause in batches of
+// cleanupBatchSize, using the subquery form `id IN (SELECT id … LIMIT N)`.
+// PostgreSQL does not support `DELETE … LIMIT` directly, and this form lets the
 // inner SELECT use the WHERE-clause index (e.g. expires_at).
-func (s *Store) deleteInBatches(
+//
+// The helper assumes `model` has an `id` primary-key column — all current
+// cleanup targets (AccessToken, DeviceCode, AuditLog) satisfy this. Do not call
+// it with models whose PK column is named differently.
+func (s *Store) deleteByIDInBatches(
 	model any,
 	whereClause string,
 	args ...any,
 ) (int64, error) {
+	if cleanupBatchSize <= 0 {
+		return 0, fmt.Errorf("cleanupBatchSize must be positive, got %d", cleanupBatchSize)
+	}
 	var total int64
 	for {
 		sub := s.db.Model(model).
@@ -39,7 +47,7 @@ func (s *Store) deleteInBatches(
 			return total, res.Error
 		}
 		total += res.RowsAffected
-		if res.RowsAffected < int64(cleanupBatchSize) {
+		if res.RowsAffected == 0 || res.RowsAffected < int64(cleanupBatchSize) {
 			return total, nil
 		}
 		time.Sleep(cleanupBatchPause)
@@ -48,13 +56,13 @@ func (s *Store) deleteInBatches(
 
 // DeleteExpiredTokens removes access/refresh tokens past expiry.
 func (s *Store) DeleteExpiredTokens() error {
-	_, err := s.deleteInBatches(&models.AccessToken{}, "expires_at < ?", time.Now())
+	_, err := s.deleteByIDInBatches(&models.AccessToken{}, "expires_at < ?", time.Now())
 	return err
 }
 
 // DeleteExpiredDeviceCodes removes device authorization codes past expiry.
 func (s *Store) DeleteExpiredDeviceCodes() error {
-	_, err := s.deleteInBatches(&models.DeviceCode{}, "expires_at < ?", time.Now())
+	_, err := s.deleteByIDInBatches(&models.DeviceCode{}, "expires_at < ?", time.Now())
 	return err
 }
 
diff --git a/internal/store/cleanup_test.go b/internal/store/cleanup_test.go
index ce68402a..42fa0381 100644
--- a/internal/store/cleanup_test.go
+++ b/internal/store/cleanup_test.go
@@ -80,6 +80,23 @@ func TestDeleteExpiredTokensBatched(t *testing.T) {
 	assert.Equal(t, int64(30), remaining, "only unexpired tokens should remain")
 }
 
+func TestDeleteExpiredTokensInvalidBatchSize(t *testing.T) {
+	// Guard against a misconfigured batch size silently looping forever: GORM
+	// treats Limit(0) as "no limit", so the old termination check never fired.
+	withSmallCleanupBatch(t, 0)
+	s := createFreshStore(t, "sqlite", nil)
+
+	seedExpiredTokens(t, s, 5)
+
+	err := s.DeleteExpiredTokens()
+	require.Error(t, err)
+	assert.Contains(t, err.Error(), "cleanupBatchSize must be positive")
+
+	var remaining int64
+	require.NoError(t, s.db.Model(&models.AccessToken{}).Count(&remaining).Error)
+	assert.Equal(t, int64(5), remaining, "no rows should be deleted when guard fires")
+}
+
 func TestDeleteExpiredTokensNoRows(t *testing.T) {
 	withSmallCleanupBatch(t, 100)
 	s := createFreshStore(t, "sqlite", nil)

From ef52a809462fcfdec382eee08d74f1880b65e059 Mon Sep 17 00:00:00 2001
From: Bo-Yi Wu <appleboy.tw@gmail.com>
Date: Sun, 12 Apr 2026 22:31:59 +0800
Subject: [PATCH 4/4] fix(db): harden cleanup config validation and remove
 redundant index

- Validate EnableCleanupLock requires REDIS_ADDR and positive CleanupLockKeyValidity, and require positive ExpiredTokenCleanupInterval when expired-token cleanup is enabled, so misconfiguration fails fast instead of panicking time.NewTicker or throwing runtime lock errors
- Guard addExpiredTokenCleanupJob against non-positive interval as defense in depth
- Drop standalone idx_oauth_conn_user_id in favor of the existing composite unique index idx_oauth_user_provider whose leading column already serves user_id-only predicates
- Flip METRICS_GAUGE_UPDATE_ENABLED default to false in the production template so copy-pasting into every pod does not double-count gauge metrics

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .env.production.example                       |  8 ++-
 internal/bootstrap/server.go                  |  2 +-
 internal/config/config.go                     | 22 ++++++
 internal/config/config_test.go                | 71 +++++++++++++++++++
 internal/models/oauth_connection.go           |  2 +-
 .../migrations/001_add_phase1_indexes.sql     |  7 +-
 6 files changed, 104 insertions(+), 8 deletions(-)

diff --git a/.env.production.example b/.env.production.example
index dccb7db2..df5597e8 100644
--- a/.env.production.example
+++ b/.env.production.example
@@ -63,9 +63,11 @@ RATE_LIMIT_STORE=redis
 
 # ---- Metrics ----------------------------------------------------------------
 METRICS_ENABLED=true
-# Enable on ONE replica only; other pods should set this to false to avoid
-# duplicated gauge updates. Alternatively, use PromQL avg()/max() aggregation.
-METRICS_GAUGE_UPDATE_ENABLED=true
+# Gauge updates query global counts; if every pod runs them you get duplicated
+# values across the fleet. Default this template to false so copying the file
+# into all pods is safe. On ONE dedicated replica set METRICS_GAUGE_UPDATE_ENABLED=true
+# (or set it true on all pods and aggregate with avg()/max() in PromQL).
+METRICS_GAUGE_UPDATE_ENABLED=false
 
 # ---- Sessions ---------------------------------------------------------------
 SESSION_FINGERPRINT=true
diff --git a/internal/bootstrap/server.go b/internal/bootstrap/server.go
index 333ff7cc..f6ff27f1 100644
--- a/internal/bootstrap/server.go
+++ b/internal/bootstrap/server.go
@@ -309,7 +309,7 @@ func addExpiredTokenCleanupJob(
 	cfg *config.Config,
 	locker rueidislock.Locker,
 ) {
-	if !cfg.EnableExpiredTokenCleanup {
+	if !cfg.EnableExpiredTokenCleanup || cfg.ExpiredTokenCleanupInterval <= 0 {
 		return
 	}
 
diff --git a/internal/config/config.go b/internal/config/config.go
index 2a8ff395..824bcd4c 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -1,6 +1,7 @@
 package config
 
 import (
+	"errors"
 	"fmt"
 	"os"
 	"strings"
@@ -636,6 +637,27 @@ func (c *Config) Validate() error {
 		)
 	}
 
+	// Cleanup lock validation (only when enabled)
+	if c.EnableCleanupLock {
+		if c.RedisAddr == "" {
+			return errors.New("REDIS_ADDR must be set when ENABLE_CLEANUP_LOCK=true")
+		}
+		if c.CleanupLockKeyValidity <= 0 {
+			return fmt.Errorf(
+				"CLEANUP_LOCK_KEY_VALIDITY must be a positive duration when ENABLE_CLEANUP_LOCK=true (got %s)",
+				c.CleanupLockKeyValidity,
+			)
+		}
+	}
+
+	// Expired-token cleanup interval must be positive when cleanup is enabled.
+	if c.EnableExpiredTokenCleanup && c.ExpiredTokenCleanupInterval <= 0 {
+		return fmt.Errorf(
+			"EXPIRED_TOKEN_CLEANUP_INTERVAL must be a positive duration when ENABLE_EXPIRED_TOKEN_CLEANUP=true (got %s)",
+			c.ExpiredTokenCleanupInterval,
+		)
+	}
+
 	// Token cache validation (only when enabled)
 	if c.TokenCacheEnabled {
 		if err := validateCacheType("TOKEN_CACHE_TYPE", c.TokenCacheType, c.RedisAddr); err != nil {
diff --git a/internal/config/config_test.go b/internal/config/config_test.go
index a2d31ef5..b4a25fd1 100644
--- a/internal/config/config_test.go
+++ b/internal/config/config_test.go
@@ -751,6 +751,77 @@ func TestConfig_Validate_SessionRememberMeMaxAge(t *testing.T) {
 	})
 }
 
+func TestConfig_Validate_CleanupLock(t *testing.T) {
+	t.Run("enabled without redis address fails", func(t *testing.T) {
+		cfg := validBaseConfig()
+		cfg.EnableCleanupLock = true
+		cfg.CleanupLockKeyValidity = 5 * time.Minute
+		cfg.RedisAddr = ""
+		err := cfg.Validate()
+		require.Error(t, err)
+		assert.Contains(t, err.Error(), "REDIS_ADDR must be set when ENABLE_CLEANUP_LOCK=true")
+	})
+
+	t.Run("enabled with zero validity fails", func(t *testing.T) {
+		cfg := validBaseConfig()
+		cfg.EnableCleanupLock = true
+		cfg.RedisAddr = "localhost:6379"
+		cfg.CleanupLockKeyValidity = 0
+		err := cfg.Validate()
+		require.Error(t, err)
+		assert.Contains(t, err.Error(), "CLEANUP_LOCK_KEY_VALIDITY must be a positive duration")
+	})
+
+	t.Run("enabled with valid settings passes", func(t *testing.T) {
+		cfg := validBaseConfig()
+		cfg.EnableCleanupLock = true
+		cfg.RedisAddr = "localhost:6379"
+		cfg.CleanupLockKeyValidity = 5 * time.Minute
+		err := cfg.Validate()
+		require.NoError(t, err)
+	})
+
+	t.Run("disabled does not require redis or validity", func(t *testing.T) {
+		cfg := validBaseConfig()
+		cfg.EnableCleanupLock = false
+		cfg.RedisAddr = ""
+		cfg.CleanupLockKeyValidity = 0
+		err := cfg.Validate()
+		require.NoError(t, err)
+	})
+}
+
+func TestConfig_Validate_ExpiredTokenCleanupInterval(t *testing.T) {
+	t.Run("enabled with zero interval fails", func(t *testing.T) {
+		cfg := validBaseConfig()
+		cfg.EnableExpiredTokenCleanup = true
+		cfg.ExpiredTokenCleanupInterval = 0
+		err := cfg.Validate()
+		require.Error(t, err)
+		assert.Contains(
+			t,
+			err.Error(),
+			"EXPIRED_TOKEN_CLEANUP_INTERVAL must be a positive duration",
+		)
+	})
+
+	t.Run("enabled with positive interval passes", func(t *testing.T) {
+		cfg := validBaseConfig()
+		cfg.EnableExpiredTokenCleanup = true
+		cfg.ExpiredTokenCleanupInterval = 30 * time.Minute
+		err := cfg.Validate()
+		require.NoError(t, err)
+	})
+
+	t.Run("disabled skips interval validation", func(t *testing.T) {
+		cfg := validBaseConfig()
+		cfg.EnableExpiredTokenCleanup = false
+		cfg.ExpiredTokenCleanupInterval = 0
+		err := cfg.Validate()
+		require.NoError(t, err)
+	})
+}
+
 func TestAuthCodeFlowConfigFields(t *testing.T) {
 	// Verify the fields exist and have sensible types/values
 	cfg := &Config{
diff --git a/internal/models/oauth_connection.go b/internal/models/oauth_connection.go
index fd7f5756..010edba6 100644
--- a/internal/models/oauth_connection.go
+++ b/internal/models/oauth_connection.go
@@ -7,7 +7,7 @@ import (
 // OAuthConnection represents an OAuth provider connection for a user
 type OAuthConnection struct {
 	ID             string `gorm:"primaryKey"`
-	UserID         string `gorm:"not null;index:idx_oauth_conn_user_id;uniqueIndex:idx_oauth_user_provider,priority:1"`
+	UserID         string `gorm:"not null;uniqueIndex:idx_oauth_user_provider,priority:1"`
 	Provider       string `gorm:"not null;uniqueIndex:idx_oauth_provider_user,priority:1;uniqueIndex:idx_oauth_user_provider,priority:2"` // "github", "gitea", "gitlab"
 	ProviderUserID string `gorm:"not null;uniqueIndex:idx_oauth_provider_user,priority:2"`                                                // Provider's user ID
 
diff --git a/internal/store/migrations/001_add_phase1_indexes.sql b/internal/store/migrations/001_add_phase1_indexes.sql
index 8c2ff3f1..b9d55c79 100644
--- a/internal/store/migrations/001_add_phase1_indexes.sql
+++ b/internal/store/migrations/001_add_phase1_indexes.sql
@@ -20,9 +20,10 @@ CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_user_active
 CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_client_active
     ON user_authorizations (client_id, is_active);
 
--- oauth_connections: GetOAuthConnectionsByUserID
-CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_oauth_conn_user_id
-    ON oauth_connections (user_id);
+-- oauth_connections: GetOAuthConnectionsByUserID / DeleteOAuthConnectionsByUserID
+-- are served by the existing unique composite index idx_oauth_user_provider
+-- (user_id, provider); PostgreSQL uses the leading column for user_id-only
+-- predicates, so no standalone user_id index is required.
 
 -- audit_logs: admin filter combinations
 CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_audit_severity