Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 73 additions & 0 deletions .env.production.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# AuthGate production environment template for multi-pod deployments
# (20k+ users, 5+ replicas, PostgreSQL + Redis).
#
# Copy to .env or inject via your secrets manager / Helm values, then override
# secrets (JWT_SECRET, SESSION_SECRET, DATABASE_DSN, REDIS_ADDR, etc.) to match
# your infrastructure. Adjust cache/pool sizes only after observing real
# traffic metrics (cache hit rate, DB CPU, connection count).
# =============================================================================

ENVIRONMENT=production

# ---- Secrets (REQUIRED: generate fresh values with `openssl rand -hex 32`) ---
JWT_SECRET=CHANGE_ME
SESSION_SECRET=CHANGE_ME

# ---- Database ---------------------------------------------------------------
DATABASE_DRIVER=postgres
DATABASE_DSN=host=postgres user=authgate password=CHANGE_ME dbname=authgate port=5432 sslmode=require

# Connection pool: 5 pods × 25 conns = 125; ensure PG max_connections >= 200.
DB_MAX_OPEN_CONNS=25
DB_MAX_IDLE_CONNS=10
DB_CONN_MAX_LIFETIME=5m
DB_CONN_MAX_IDLE_TIME=10m

# ---- Redis (shared cache + rate limit + cleanup lock) -----------------------
REDIS_ADDR=redis:6379
# REDIS_PASSWORD=
# REDIS_DB=0

# ---- Token verification cache (major DB-load reducer) -----------------------
# Off by default in .env.example; production should enable this.
TOKEN_CACHE_ENABLED=true
TOKEN_CACHE_TYPE=redis-aside
TOKEN_CACHE_TTL=10h
TOKEN_CACHE_CLIENT_TTL=1h
# If pod memory is tight, drop this to 16 (MB) per connection.
TOKEN_CACHE_SIZE_PER_CONN=32

# ---- Client / User / Metrics cache (shared across pods) ---------------------
CLIENT_CACHE_TYPE=redis-aside
CLIENT_COUNT_CACHE_TYPE=redis-aside
USER_CACHE_TYPE=redis-aside
METRICS_CACHE_TYPE=redis-aside

# ---- Expired token / device code cleanup ------------------------------------
# All pods may enable this: a Redis-backed distributed lock (below) prevents
# concurrent runs — only one pod does the DELETE each interval.
ENABLE_EXPIRED_TOKEN_CLEANUP=true
EXPIRED_TOKEN_CLEANUP_INTERVAL=30m

# Distributed cleanup lock via rueidislock. Required for multi-pod.
ENABLE_CLEANUP_LOCK=true
CLEANUP_LOCK_KEY_VALIDITY=5m

# ---- Audit logging ----------------------------------------------------------
ENABLE_AUDIT_LOGGING=true
# 90 days. (Comment kept on its own line: some dotenv loaders treat trailing
# text after an unquoted value as part of the value.)
AUDIT_LOG_RETENTION=2160h

# ---- Rate limiting (distributed) --------------------------------------------
ENABLE_RATE_LIMIT=true
RATE_LIMIT_STORE=redis

# ---- Metrics ----------------------------------------------------------------
METRICS_ENABLED=true
# Gauge updates query global counts; if every pod runs them you get duplicated
# values across the fleet. Default this template to false so copying the file
# into all pods is safe. On ONE dedicated replica set METRICS_GAUGE_UPDATE_ENABLED=true
# (or set it true on all pods and aggregate with avg()/max() in PromQL).
METRICS_GAUGE_UPDATE_ENABLED=false

# ---- Sessions ---------------------------------------------------------------
SESSION_FINGERPRINT=true
13 changes: 11 additions & 2 deletions internal/bootstrap/bootstrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
"github.com/gin-gonic/gin"
"github.com/prometheus/client_golang/prometheus"
"github.com/redis/go-redis/v9"
"github.com/redis/rueidis/rueidislock"
)

// Application holds all initialized components
Expand All @@ -39,6 +40,7 @@ type Application struct {
TokenCache core.Cache[models.AccessToken]
TokenCacheCloser func() error
RateLimitRedisClient *redis.Client
CleanupLocker rueidislock.Locker

// Services
AuditService core.AuditLogger
Expand Down Expand Up @@ -137,6 +139,12 @@ func (app *Application) initializeInfrastructure(ctx context.Context) error {
return err
}

// Distributed cleanup lock (multi-pod: serialize DELETE jobs)
app.CleanupLocker, err = initializeCleanupLocker(app.Config)
if err != nil {
return err
}

return nil
}

Expand Down Expand Up @@ -222,8 +230,9 @@ func (app *Application) startWithGracefulShutdown() {
addClientCacheCleanupJob(m, app.ClientCache, app.Config)
addTokenCacheCleanupJob(m, app.TokenCache, app.Config)
addDatabaseShutdownJob(m, app.DB, app.Config)
addAuditLogCleanupJob(m, app.Config, app.AuditService)
addExpiredTokenCleanupJob(m, app.DB, app.Config)
addAuditLogCleanupJob(m, app.Config, app.AuditService, app.CleanupLocker)
addExpiredTokenCleanupJob(m, app.DB, app.Config, app.CleanupLocker)
addCleanupLockerShutdownJob(m, app.CleanupLocker)
addMetricsGaugeUpdateJob(m, app.Config, app.DB, app.MetricsRecorder, app.MetricsCache)

// Wait for graceful shutdown
Expand Down
80 changes: 80 additions & 0 deletions internal/bootstrap/cleanup_lock.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
package bootstrap

import (
"context"
"errors"
"fmt"
"log"

"github.com/redis/rueidis"
"github.com/redis/rueidis/rueidislock"

"github.com/go-authgate/authgate/internal/config"
)

// Lock names for the distributed cleanup jobs. Each periodic cleanup job
// acquires its own named lock so the two jobs never block each other.
// The full Redis key is prefixed with "authgate:lock" by the locker
// configuration. Keep these values in sync with docs/runbooks that may
// reference the keys when debugging stuck cleanups.
const (
	cleanupLockAuditLogs     = "cleanup:audit-logs"
	cleanupLockExpiredTokens = "cleanup:expired-tokens"
)

// initializeCleanupLocker builds a Redis-backed distributed locker that
// serializes periodic cleanup jobs across multi-pod deployments. Returns
// (nil, nil) when cleanup lock is disabled; callers treat a nil locker as
// "run unconditionally" (single-instance mode).
//
// KeyMajority is 1 (single Redis target) rather than a Redlock quorum. A
// Redis failover window could allow two pods to hold the lock simultaneously,
// but cleanup DELETEs are idempotent (the inner SELECT finds no matching rows
// on the second pod), so this is safe — the worst case is transient double
// work, never data loss or corruption.
// initializeCleanupLocker constructs the Redis-backed locker used to ensure
// that periodic cleanup jobs run on at most one pod at a time. When the
// cleanup-lock feature is disabled it returns (nil, nil); callers interpret
// a nil locker as "no coordination needed" and run every tick locally.
//
// KeyMajority is deliberately 1 (a single Redis target, not a Redlock
// quorum). During a Redis failover window two pods could briefly both
// believe they hold the lock, but the cleanup DELETEs are idempotent — the
// slower pod simply finds no matching rows — so the worst case is transient
// duplicate work, never data loss or corruption.
func initializeCleanupLocker(cfg *config.Config) (rueidislock.Locker, error) {
	if !cfg.EnableCleanupLock {
		return nil, nil //nolint:nilnil // locker not needed when feature is disabled
	}
	if cfg.RedisAddr == "" {
		return nil, errors.New("ENABLE_CLEANUP_LOCK requires REDIS_ADDR to be set")
	}

	// Build the locker options up front; the lock keys share one prefix so
	// they are easy to find when inspecting Redis by hand.
	opt := rueidislock.LockerOption{
		ClientOption: rueidis.ClientOption{
			InitAddress: []string{cfg.RedisAddr},
			Password:    cfg.RedisPassword,
			SelectDB:    cfg.RedisDB,
		},
		KeyPrefix:   "authgate:lock",
		KeyMajority: 1,
		KeyValidity: cfg.CleanupLockKeyValidity,
	}

	locker, err := rueidislock.NewLocker(opt)
	if err != nil {
		return nil, fmt.Errorf("failed to create cleanup locker: %w", err)
	}

	log.Printf("Cleanup lock initialized (validity: %v)", cfg.CleanupLockKeyValidity)
	return locker, nil
}

// runWithCleanupLock executes fn while holding the named distributed lock.
// When locker is nil (single-instance mode) fn runs unconditionally.
// When another pod currently holds the lock, fn is skipped silently and
// nil is returned — the next tick will try again.
// runWithCleanupLock runs fn under the distributed lock identified by name.
// A nil locker means single-instance mode: fn executes directly with ctx.
// If another pod currently holds the lock, fn is skipped and nil is returned
// so the caller simply waits for its next tick; any other acquisition
// failure is reported as an error.
func runWithCleanupLock(
	ctx context.Context,
	locker rueidislock.Locker,
	name string,
	fn func(context.Context) error,
) error {
	if locker == nil {
		return fn(ctx)
	}

	lockCtx, cancel, err := locker.TryWithContext(ctx, name)
	switch {
	case errors.Is(err, rueidislock.ErrNotLocked):
		// Lock is held elsewhere — skip this tick silently.
		return nil
	case err != nil:
		return fmt.Errorf("acquire cleanup lock %q: %w", name, err)
	}
	defer cancel()

	// lockCtx is canceled if the lock is lost, letting fn bail out early.
	return fn(lockCtx)
}
96 changes: 69 additions & 27 deletions internal/bootstrap/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (

"github.com/appleboy/graceful"
"github.com/redis/go-redis/v9"
"github.com/redis/rueidis/rueidislock"
)

// createHTTPServer creates the HTTP server instance
Expand Down Expand Up @@ -118,36 +119,46 @@ func addAuditServiceShutdownJob(
})
}

// addAuditLogCleanupJob adds periodic audit log cleanup job
// addAuditLogCleanupJob adds periodic audit log cleanup job.
// When a cleanup locker is supplied, only one pod across the fleet performs
// the DELETE per tick; others skip silently.
func addAuditLogCleanupJob(
m *graceful.Manager,
cfg *config.Config,
auditService core.AuditLogger,
locker rueidislock.Locker,
) {
if !cfg.EnableAuditLogging || cfg.AuditLogRetention <= 0 {
if !cfg.EnableAuditLogging || cfg.AuditLogRetention <= 0 || cfg.AuditLogCleanupInterval <= 0 {
return
}

run := func(ctx context.Context) error {
return runWithCleanupLock(ctx, locker, cleanupLockAuditLogs, func(context.Context) error {
deleted, err := auditService.CleanupOldLogs(cfg.AuditLogRetention)
if err != nil {
log.Printf("Failed to cleanup old audit logs: %v", err)
return nil
}
if deleted > 0 {
log.Printf("Cleaned up %d old audit logs", deleted)
}
return nil
})
}

m.AddRunningJob(func(ctx context.Context) error {
ticker := time.NewTicker(24 * time.Hour)
ticker := time.NewTicker(cfg.AuditLogCleanupInterval)
defer ticker.Stop()

Comment thread
appleboy marked this conversation as resolved.
// Run cleanup immediately on startup
if deleted, err := auditService.CleanupOldLogs(cfg.AuditLogRetention); err != nil {
log.Printf("Failed to cleanup old audit logs: %v", err)
} else if deleted > 0 {
log.Printf("Cleaned up %d old audit logs", deleted)
if err := run(ctx); err != nil {
log.Printf("Audit log cleanup run error: %v", err)
}

for {
select {
case <-ticker.C:
if deleted, err := auditService.CleanupOldLogs(
cfg.AuditLogRetention,
); err != nil {
log.Printf("Failed to cleanup old audit logs: %v", err)
} else if deleted > 0 {
log.Printf("Cleaned up %d old audit logs", deleted)
if err := run(ctx); err != nil {
log.Printf("Audit log cleanup run error: %v", err)
}
case <-ctx.Done():
return nil
Expand Down Expand Up @@ -290,31 +301,48 @@ func addTokenCacheCleanupJob(

// addExpiredTokenCleanupJob adds a periodic job that purges expired access tokens
// and device codes from the database to prevent unbounded table growth.
func addExpiredTokenCleanupJob(m *graceful.Manager, db *store.Store, cfg *config.Config) {
if !cfg.EnableExpiredTokenCleanup {
// When a cleanup locker is supplied, only one pod across the fleet performs
// the DELETE per tick; others skip silently.
func addExpiredTokenCleanupJob(
m *graceful.Manager,
db *store.Store,
cfg *config.Config,
locker rueidislock.Locker,
) {
if !cfg.EnableExpiredTokenCleanup || cfg.ExpiredTokenCleanupInterval <= 0 {
return
}
Comment thread
appleboy marked this conversation as resolved.

run := func(ctx context.Context) error {
return runWithCleanupLock(
ctx,
locker,
cleanupLockExpiredTokens,
func(context.Context) error {
if err := db.DeleteExpiredTokens(); err != nil {
log.Printf("Failed to cleanup expired tokens: %v", err)
}
if err := db.DeleteExpiredDeviceCodes(); err != nil {
log.Printf("Failed to cleanup expired device codes: %v", err)
}
return nil
},
)
}

m.AddRunningJob(func(ctx context.Context) error {
ticker := time.NewTicker(cfg.ExpiredTokenCleanupInterval)
defer ticker.Stop()

// Run cleanup immediately on startup
if err := db.DeleteExpiredTokens(); err != nil {
log.Printf("Failed to cleanup expired tokens: %v", err)
}
if err := db.DeleteExpiredDeviceCodes(); err != nil {
log.Printf("Failed to cleanup expired device codes: %v", err)
if err := run(ctx); err != nil {
log.Printf("Expired token cleanup run error: %v", err)
}

for {
select {
case <-ticker.C:
if err := db.DeleteExpiredTokens(); err != nil {
log.Printf("Failed to cleanup expired tokens: %v", err)
}
if err := db.DeleteExpiredDeviceCodes(); err != nil {
log.Printf("Failed to cleanup expired device codes: %v", err)
if err := run(ctx); err != nil {
log.Printf("Expired token cleanup run error: %v", err)
}
case <-ctx.Done():
return nil
Expand All @@ -323,6 +351,20 @@ func addExpiredTokenCleanupJob(m *graceful.Manager, db *store.Store, cfg *config
})
}

// addCleanupLockerShutdownJob closes the distributed cleanup locker on
// shutdown. No-op when locker is nil.
// addCleanupLockerShutdownJob registers a graceful-shutdown hook that closes
// the distributed cleanup locker, releasing its Redis connections. When the
// locker is nil (cleanup lock disabled) nothing is registered.
func addCleanupLockerShutdownJob(m *graceful.Manager, locker rueidislock.Locker) {
	if locker == nil {
		return
	}

	m.AddShutdownJob(func() error {
		log.Println("Closing cleanup locker...")
		locker.Close()
		log.Println("Cleanup locker closed")
		return nil
	})
}

// addDatabaseShutdownJob adds database connection close handler
func addDatabaseShutdownJob(m *graceful.Manager, db *store.Store, cfg *config.Config) {
m.AddShutdownJob(func() error {
Expand Down
Loading
Loading