From 6d71baf68321bdc679ddf10ae4acc2baa5b869be Mon Sep 17 00:00:00 2001 From: Bo-Yi Wu Date: Sun, 12 Apr 2026 21:43:26 +0800 Subject: [PATCH 1/4] perf(db): add Phase 1 indexes, batched cleanup, and distributed cleanup lock - Add composite and single-column indexes on oauth_applications, user_authorizations, oauth_connections, audit_logs, device_codes, and authorization_codes for multi-pod hot paths - Ship PostgreSQL migration using CREATE INDEX CONCURRENTLY to avoid write locks on large tables - Batch DELETE of expired tokens, device codes, and old audit logs via shared deleteInBatches helper (10k rows per iteration) - Add Redis-backed distributed cleanup lock via rueidislock so only one pod runs each cleanup per tick - Introduce .env.production.example with recommended redis-aside cache defaults for 5+ pod deployments Co-Authored-By: Claude Opus 4.6 (1M context) --- .env.production.example | 71 +++++++ internal/bootstrap/bootstrap.go | 13 +- internal/bootstrap/cleanup_lock.go | 80 ++++++++ internal/bootstrap/server.go | 90 ++++++--- internal/config/config.go | 8 + internal/models/audit_log.go | 18 +- internal/models/authorization_code.go | 4 +- internal/models/device_code.go | 4 +- internal/models/oauth_application.go | 6 +- internal/models/oauth_connection.go | 2 +- internal/models/user_authorization.go | 8 +- internal/store/audit_log.go | 6 +- internal/store/cleanup.go | 46 ++++- internal/store/cleanup_test.go | 181 ++++++++++++++++++ .../migrations/001_add_phase1_indexes.sql | 43 +++++ 15 files changed, 528 insertions(+), 52 deletions(-) create mode 100644 .env.production.example create mode 100644 internal/bootstrap/cleanup_lock.go create mode 100644 internal/store/cleanup_test.go create mode 100644 internal/store/migrations/001_add_phase1_indexes.sql diff --git a/.env.production.example b/.env.production.example new file mode 100644 index 00000000..dccb7db2 --- /dev/null +++ b/.env.production.example @@ -0,0 +1,71 @@ +# AuthGate production environment template for multi-pod deployments +# (20k+ users, 5+ replicas, PostgreSQL + Redis). +# +# Copy to .env or inject via your secrets manager / Helm values, then override +# secrets (JWT_SECRET, SESSION_SECRET, DATABASE_DSN, REDIS_ADDR, etc.) to match +# your infrastructure. Adjust cache/pool sizes only after observing real +# traffic metrics (cache hit rate, DB CPU, connection count). +# ============================================================================= + +ENVIRONMENT=production + +# ---- Secrets (REQUIRED: generate fresh values with `openssl rand -hex 32`) --- +JWT_SECRET=CHANGE_ME +SESSION_SECRET=CHANGE_ME + +# ---- Database --------------------------------------------------------------- +DATABASE_DRIVER=postgres +DATABASE_DSN=host=postgres user=authgate password=CHANGE_ME dbname=authgate port=5432 sslmode=require + +# Connection pool: 5 pods × 25 conns = 125; ensure PG max_connections >= 200. +DB_MAX_OPEN_CONNS=25 +DB_MAX_IDLE_CONNS=10 +DB_CONN_MAX_LIFETIME=5m +DB_CONN_MAX_IDLE_TIME=10m + +# ---- Redis (shared cache + rate limit + cleanup lock) ----------------------- +REDIS_ADDR=redis:6379 +# REDIS_PASSWORD= +# REDIS_DB=0 + +# ---- Token verification cache (major DB-load reducer) ----------------------- +# Off by default in .env.example; production should enable this. +TOKEN_CACHE_ENABLED=true +TOKEN_CACHE_TYPE=redis-aside +TOKEN_CACHE_TTL=10h +TOKEN_CACHE_CLIENT_TTL=1h +# If pod memory is tight, drop this to 16 (MB) per connection. +TOKEN_CACHE_SIZE_PER_CONN=32 + +# ---- Client / User / Metrics cache (shared across pods) --------------------- +CLIENT_CACHE_TYPE=redis-aside +CLIENT_COUNT_CACHE_TYPE=redis-aside +USER_CACHE_TYPE=redis-aside +METRICS_CACHE_TYPE=redis-aside + +# ---- Expired token / device code cleanup ------------------------------------ +# All pods may enable this: a Redis-backed distributed lock (below) prevents +# concurrent runs — only one pod does the DELETE each interval. +ENABLE_EXPIRED_TOKEN_CLEANUP=true +EXPIRED_TOKEN_CLEANUP_INTERVAL=30m + +# Distributed cleanup lock via rueidislock. Required for multi-pod. +ENABLE_CLEANUP_LOCK=true +CLEANUP_LOCK_KEY_VALIDITY=5m + +# ---- Audit logging ---------------------------------------------------------- +ENABLE_AUDIT_LOGGING=true +AUDIT_LOG_RETENTION=2160h # 90 days + +# ---- Rate limiting (distributed) -------------------------------------------- +ENABLE_RATE_LIMIT=true +RATE_LIMIT_STORE=redis + +# ---- Metrics ---------------------------------------------------------------- +METRICS_ENABLED=true +# Enable on ONE replica only; other pods should set this to false to avoid +# duplicated gauge updates. Alternatively, use PromQL avg()/max() aggregation. +METRICS_GAUGE_UPDATE_ENABLED=true + +# ---- Sessions --------------------------------------------------------------- +SESSION_FINGERPRINT=true diff --git a/internal/bootstrap/bootstrap.go b/internal/bootstrap/bootstrap.go index 8587cfed..f02505d6 100644 --- a/internal/bootstrap/bootstrap.go +++ b/internal/bootstrap/bootstrap.go @@ -15,6 +15,7 @@ import ( "github.com/gin-gonic/gin" "github.com/prometheus/client_golang/prometheus" "github.com/redis/go-redis/v9" + "github.com/redis/rueidis/rueidislock" ) // Application holds all initialized components @@ -39,6 +40,7 @@ type Application struct { TokenCache core.Cache[models.AccessToken] TokenCacheCloser func() error RateLimitRedisClient *redis.Client + CleanupLocker rueidislock.Locker // Services AuditService core.AuditLogger @@ -137,6 +139,12 @@ func (app *Application) initializeInfrastructure(ctx context.Context) error { return err } + // Distributed cleanup lock (multi-pod: serialize DELETE jobs) + app.CleanupLocker, err = initializeCleanupLocker(app.Config) + if err != nil { + return err + } + return nil } @@ -222,8 +230,9 @@ func (app *Application) startWithGracefulShutdown() { addClientCacheCleanupJob(m, app.ClientCache, app.Config) addTokenCacheCleanupJob(m, app.TokenCache, app.Config) addDatabaseShutdownJob(m, app.DB, app.Config) - addAuditLogCleanupJob(m, app.Config, app.AuditService) - addExpiredTokenCleanupJob(m, app.DB, app.Config) + addAuditLogCleanupJob(m, app.Config, app.AuditService, app.CleanupLocker) + addExpiredTokenCleanupJob(m, app.DB, app.Config, app.CleanupLocker) + addCleanupLockerShutdownJob(m, app.CleanupLocker) addMetricsGaugeUpdateJob(m, app.Config, app.DB, app.MetricsRecorder, app.MetricsCache) // Wait for graceful shutdown diff --git a/internal/bootstrap/cleanup_lock.go b/internal/bootstrap/cleanup_lock.go new file mode 100644 index 00000000..97cb1788 --- /dev/null +++ b/internal/bootstrap/cleanup_lock.go @@ -0,0 +1,80 @@ +package bootstrap + +import ( + "context" + "errors" + "fmt" + "log" + + "github.com/redis/rueidis" + "github.com/redis/rueidis/rueidislock" + + "github.com/go-authgate/authgate/internal/config" +) + +// Lock names for the distributed cleanup jobs. Keep in sync with docs/runbooks +// that may reference these keys for debugging stuck cleanups. +const ( + cleanupLockAuditLogs = "cleanup:audit-logs" + cleanupLockExpiredTokens = "cleanup:expired-tokens" +) + +// initializeCleanupLocker builds a Redis-backed distributed locker that +// serializes periodic cleanup jobs across multi-pod deployments. Returns +// (nil, nil) when cleanup lock is disabled; callers treat a nil locker as +// "run unconditionally" (single-instance mode). +// +// KeyMajority is 1 (single Redis target) rather than a Redlock quorum. A +// Redis failover window could allow two pods to hold the lock simultaneously, +// but cleanup DELETEs are idempotent (the inner SELECT finds no matching rows +// on the second pod), so this is safe — the worst case is transient double +// work, never data loss or corruption. +func initializeCleanupLocker(cfg *config.Config) (rueidislock.Locker, error) { + if !cfg.EnableCleanupLock { + return nil, nil //nolint:nilnil // locker not needed when feature is disabled + } + if cfg.RedisAddr == "" { + return nil, errors.New("ENABLE_CLEANUP_LOCK requires REDIS_ADDR to be set") + } + + locker, err := rueidislock.NewLocker(rueidislock.LockerOption{ + ClientOption: rueidis.ClientOption{ + InitAddress: []string{cfg.RedisAddr}, + Password: cfg.RedisPassword, + SelectDB: cfg.RedisDB, + }, + KeyPrefix: "authgate:lock", + KeyMajority: 1, + KeyValidity: cfg.CleanupLockKeyValidity, + }) + if err != nil { + return nil, fmt.Errorf("failed to create cleanup locker: %w", err) + } + + log.Printf("Cleanup lock initialized (validity: %v)", cfg.CleanupLockKeyValidity) + return locker, nil +} + +// runWithCleanupLock executes fn while holding the named distributed lock. +// When locker is nil (single-instance mode) fn runs unconditionally. +// When another pod currently holds the lock, fn is skipped silently and +// nil is returned — the next tick will try again. +func runWithCleanupLock( + ctx context.Context, + locker rueidislock.Locker, + name string, + fn func(context.Context) error, +) error { + if locker == nil { + return fn(ctx) + } + lockCtx, cancel, err := locker.TryWithContext(ctx, name) + if err != nil { + if errors.Is(err, rueidislock.ErrNotLocked) { + return nil + } + return fmt.Errorf("acquire cleanup lock %q: %w", name, err) + } + defer cancel() + return fn(lockCtx) +} diff --git a/internal/bootstrap/server.go b/internal/bootstrap/server.go index c9962082..282c3eb2 100644 --- a/internal/bootstrap/server.go +++ b/internal/bootstrap/server.go @@ -14,6 +14,7 @@ import ( "github.com/appleboy/graceful" "github.com/redis/go-redis/v9" + "github.com/redis/rueidis/rueidislock" ) // createHTTPServer creates the HTTP server instance @@ -118,36 +119,46 @@ func addAuditServiceShutdownJob( }) } -// addAuditLogCleanupJob adds periodic audit log cleanup job +// addAuditLogCleanupJob adds periodic audit log cleanup job. +// When a cleanup locker is supplied, only one pod across the fleet performs +// the DELETE per tick; others skip silently. func addAuditLogCleanupJob( m *graceful.Manager, cfg *config.Config, auditService core.AuditLogger, + locker rueidislock.Locker, ) { if !cfg.EnableAuditLogging || cfg.AuditLogRetention <= 0 { return } + run := func(ctx context.Context) error { + return runWithCleanupLock(ctx, locker, cleanupLockAuditLogs, func(context.Context) error { + deleted, err := auditService.CleanupOldLogs(cfg.AuditLogRetention) + if err != nil { + log.Printf("Failed to cleanup old audit logs: %v", err) + return nil + } + if deleted > 0 { + log.Printf("Cleaned up %d old audit logs", deleted) + } + return nil + }) + } + m.AddRunningJob(func(ctx context.Context) error { ticker := time.NewTicker(24 * time.Hour) defer ticker.Stop() - // Run cleanup immediately on startup - if deleted, err := auditService.CleanupOldLogs(cfg.AuditLogRetention); err != nil { - log.Printf("Failed to cleanup old audit logs: %v", err) - } else if deleted > 0 { - log.Printf("Cleaned up %d old audit logs", deleted) + if err := run(ctx); err != nil { + log.Printf("Audit log cleanup run error: %v", err) } for { select { case <-ticker.C: - if deleted, err := auditService.CleanupOldLogs( - cfg.AuditLogRetention, - ); err != nil { - log.Printf("Failed to cleanup old audit logs: %v", err) - } else if deleted > 0 { - log.Printf("Cleaned up %d old audit logs", deleted) + if err := run(ctx); err != nil { + log.Printf("Audit log cleanup run error: %v", err) } case <-ctx.Done(): return nil @@ -290,31 +301,48 @@ func addTokenCacheCleanupJob( // addExpiredTokenCleanupJob adds a periodic job that purges expired access tokens // and device codes from the database to prevent unbounded table growth. -func addExpiredTokenCleanupJob(m *graceful.Manager, db *store.Store, cfg *config.Config) { +// When a cleanup locker is supplied, only one pod across the fleet performs +// the DELETE per tick; others skip silently. +func addExpiredTokenCleanupJob( + m *graceful.Manager, + db *store.Store, + cfg *config.Config, + locker rueidislock.Locker, +) { if !cfg.EnableExpiredTokenCleanup { return } + run := func(ctx context.Context) error { + return runWithCleanupLock( + ctx, + locker, + cleanupLockExpiredTokens, + func(context.Context) error { + if err := db.DeleteExpiredTokens(); err != nil { + log.Printf("Failed to cleanup expired tokens: %v", err) + } + if err := db.DeleteExpiredDeviceCodes(); err != nil { + log.Printf("Failed to cleanup expired device codes: %v", err) + } + return nil + }, + ) + } + m.AddRunningJob(func(ctx context.Context) error { ticker := time.NewTicker(cfg.ExpiredTokenCleanupInterval) defer ticker.Stop() - // Run cleanup immediately on startup - if err := db.DeleteExpiredTokens(); err != nil { - log.Printf("Failed to cleanup expired tokens: %v", err) - } - if err := db.DeleteExpiredDeviceCodes(); err != nil { - log.Printf("Failed to cleanup expired device codes: %v", err) + if err := run(ctx); err != nil { + log.Printf("Expired token cleanup run error: %v", err) } for { select { case <-ticker.C: - if err := db.DeleteExpiredTokens(); err != nil { - log.Printf("Failed to cleanup expired tokens: %v", err) - } - if err := db.DeleteExpiredDeviceCodes(); err != nil { - log.Printf("Failed to cleanup expired device codes: %v", err) + if err := run(ctx); err != nil { + log.Printf("Expired token cleanup run error: %v", err) } case <-ctx.Done(): return nil @@ -323,6 +351,20 @@ func addExpiredTokenCleanupJob(m *graceful.Manager, db *store.Store, cfg *config }) } +// addCleanupLockerShutdownJob closes the distributed cleanup locker on +// shutdown. No-op when locker is nil. +func addCleanupLockerShutdownJob(m *graceful.Manager, locker rueidislock.Locker) { + if locker == nil { + return + } + m.AddShutdownJob(func() error { + log.Println("Closing cleanup locker...") + locker.Close() + log.Println("Cleanup locker closed") + return nil + }) +} + // addDatabaseShutdownJob adds database connection close handler func addDatabaseShutdownJob(m *graceful.Manager, db *store.Store, cfg *config.Config) { m.AddShutdownJob(func() error { diff --git a/internal/config/config.go b/internal/config/config.go index d3a9a3ce..2a8ff395 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -164,6 +164,10 @@ type Config struct { EnableExpiredTokenCleanup bool // Enable periodic cleanup of expired tokens and device codes (default: false) ExpiredTokenCleanupInterval time.Duration // How often to purge expired rows (default: 1h) + // Distributed cleanup lock (multi-pod deployments) + EnableCleanupLock bool // Use Redis lock so only one pod runs cleanup per interval (default: false) + CleanupLockKeyValidity time.Duration // Lock validity window, auto-extended by rueidislock (default: 5m) + // Prometheus Metrics settings MetricsEnabled bool // Enable Prometheus metrics endpoint (default: false) MetricsToken string // Bearer token for /metrics (empty = no auth, recommended for production) @@ -371,6 +375,10 @@ func Load() *Config { EnableExpiredTokenCleanup: getEnvBool("ENABLE_EXPIRED_TOKEN_CLEANUP", false), ExpiredTokenCleanupInterval: getEnvDuration("EXPIRED_TOKEN_CLEANUP_INTERVAL", time.Hour), + // Distributed cleanup lock + EnableCleanupLock: getEnvBool("ENABLE_CLEANUP_LOCK", false), + CleanupLockKeyValidity: getEnvDuration("CLEANUP_LOCK_KEY_VALIDITY", 5*time.Minute), + // Prometheus Metrics settings MetricsEnabled: getEnvBool("METRICS_ENABLED", false), MetricsToken: getEnv("METRICS_TOKEN", ""), diff --git a/internal/models/audit_log.go b/internal/models/audit_log.go index 551abd84..662a1551 100644 --- a/internal/models/audit_log.go +++ b/internal/models/audit_log.go @@ -137,19 +137,19 @@ type AuditLog struct { ID string `gorm:"primaryKey;type:varchar(36)" json:"id"` // Event information - EventType EventType `gorm:"type:varchar(50);index;not null" json:"event_type"` - EventTime time.Time `gorm:"index;not null" json:"event_time"` - Severity EventSeverity `gorm:"type:varchar(20);not null" json:"severity"` + EventType EventType `gorm:"type:varchar(50);index;not null;index:idx_audit_type_time,priority:1" json:"event_type"` + EventTime time.Time `gorm:"index;not null;index:idx_audit_type_time,priority:2,sort:desc;index:idx_audit_actor_time,priority:2,sort:desc;index:idx_audit_resource_time,priority:3,sort:desc" json:"event_time"` + Severity EventSeverity `gorm:"type:varchar(20);not null;index" json:"severity"` // Actor information - ActorUserID string `gorm:"type:varchar(36);index" json:"actor_user_id"` - ActorUsername string `gorm:"type:varchar(100)" json:"actor_username"` - ActorIP string `gorm:"type:varchar(45);index" json:"actor_ip"` // Support IPv6 + ActorUserID string `gorm:"type:varchar(36);index;index:idx_audit_actor_time,priority:1" json:"actor_user_id"` + ActorUsername string `gorm:"type:varchar(100)" json:"actor_username"` + ActorIP string `gorm:"type:varchar(45);index" json:"actor_ip"` // Support IPv6 // Resource information - ResourceType ResourceType `gorm:"type:varchar(50);index" json:"resource_type"` - ResourceID string `gorm:"type:varchar(36);index" json:"resource_id"` - ResourceName string `gorm:"type:varchar(255)" json:"resource_name"` + ResourceType ResourceType `gorm:"type:varchar(50);index;index:idx_audit_resource_time,priority:1" json:"resource_type"` + ResourceID string `gorm:"type:varchar(36);index;index:idx_audit_resource_time,priority:2" json:"resource_id"` + ResourceName string `gorm:"type:varchar(255)" json:"resource_name"` // Operation details Action string `gorm:"type:varchar(255);not null" json:"action"` diff --git a/internal/models/authorization_code.go b/internal/models/authorization_code.go index 8acbf895..83222565 100644 --- a/internal/models/authorization_code.go +++ b/internal/models/authorization_code.go @@ -27,8 +27,8 @@ type AuthorizationCode struct { // OIDC (OpenID Connect Core 1.0 §3.1.2.1) Nonce string `gorm:"default:''"` // nonce from authorization request (empty if not provided) - ExpiresAt time.Time `gorm:"index"` - UsedAt *time.Time // Set immediately upon exchange; prevents replay attacks + ExpiresAt time.Time `gorm:"index;index:idx_authcode_used_exp,priority:2"` + UsedAt *time.Time `gorm:"index:idx_authcode_used_exp,priority:1"` // Set immediately upon exchange; prevents replay attacks CreatedAt time.Time } diff --git a/internal/models/device_code.go b/internal/models/device_code.go index 6abdaa0e..bc03cbd7 100644 --- a/internal/models/device_code.go +++ b/internal/models/device_code.go @@ -13,10 +13,10 @@ type DeviceCode struct { UserCode string `gorm:"uniqueIndex;not null"` ClientID string `gorm:"not null;index"` Scopes string `gorm:"not null"` // space-separated scopes - ExpiresAt time.Time `gorm:"index"` + ExpiresAt time.Time `gorm:"index;index:idx_device_auth_exp,priority:2"` Interval int // polling interval in seconds UserID string // filled after authorization - Authorized bool `gorm:"default:false"` + Authorized bool `gorm:"default:false;index:idx_device_auth_exp,priority:1"` AuthorizedAt time.Time CreatedAt time.Time UpdatedAt time.Time diff --git a/internal/models/oauth_application.go b/internal/models/oauth_application.go index 8fe215ae..a86b709a 100644 --- a/internal/models/oauth_application.go +++ b/internal/models/oauth_application.go @@ -35,15 +35,15 @@ type OAuthApplication struct { ClientSecret string `gorm:"not null"` // bcrypt hashed secret ClientName string `gorm:"not null"` Description string `gorm:"type:text"` - UserID string `gorm:"not null"` + UserID string `gorm:"not null;index"` Scopes string `gorm:"not null"` GrantTypes string `gorm:"not null;default:'device_code'"` RedirectURIs StringArray `gorm:"type:json"` ClientType string `gorm:"not null;default:'public'"` // "confidential" or "public" EnableDeviceFlow bool `gorm:"not null;default:true"` EnableAuthCodeFlow bool `gorm:"not null;default:false"` - EnableClientCredentialsFlow bool `gorm:"not null;default:false"` // Client Credentials Grant (RFC 6749 §4.4); confidential clients only - Status string `gorm:"not null;default:'active'"` // ClientStatusPending / ClientStatusActive / ClientStatusInactive + EnableClientCredentialsFlow bool `gorm:"not null;default:false"` // Client Credentials Grant (RFC 6749 §4.4); confidential clients only + Status string `gorm:"not null;default:'active';index"` // ClientStatusPending / ClientStatusActive / ClientStatusInactive CreatedBy string CreatedAt time.Time UpdatedAt time.Time diff --git a/internal/models/oauth_connection.go b/internal/models/oauth_connection.go index 010edba6..f8742410 100644 --- a/internal/models/oauth_connection.go +++ b/internal/models/oauth_connection.go @@ -7,7 +7,7 @@ import ( // OAuthConnection represents an OAuth provider connection for a user type OAuthConnection struct { ID string `gorm:"primaryKey"` - UserID string `gorm:"not null;uniqueIndex:idx_oauth_user_provider,priority:1"` + UserID string `gorm:"not null;index;uniqueIndex:idx_oauth_user_provider,priority:1"` Provider string `gorm:"not null;uniqueIndex:idx_oauth_provider_user,priority:1;uniqueIndex:idx_oauth_user_provider,priority:2"` // "github", "gitea", "gitlab" ProviderUserID string `gorm:"not null;uniqueIndex:idx_oauth_provider_user,priority:2"` // Provider's user ID diff --git a/internal/models/user_authorization.go b/internal/models/user_authorization.go index 1e23b8a2..3ab0953c 100644 --- a/internal/models/user_authorization.go +++ b/internal/models/user_authorization.go @@ -9,14 +9,14 @@ type UserAuthorization struct { UUID string `gorm:"uniqueIndex;size:36;not null"` // Public UUID for API/UI identification // Relations (composite unique index ensures one grant per user+app) - UserID string `gorm:"not null;uniqueIndex:idx_user_app"` // FK → User.ID - ApplicationID int64 `gorm:"not null;uniqueIndex:idx_user_app"` // FK → OAuthApplication.ID - ClientID string `gorm:"not null;index"` // Denormalized ClientID UUID (for UI display and API responses) + UserID string `gorm:"not null;uniqueIndex:idx_user_app;index:idx_user_active,priority:1"` // FK → User.ID + ApplicationID int64 `gorm:"not null;uniqueIndex:idx_user_app"` // FK → OAuthApplication.ID + ClientID string `gorm:"not null;index;index:idx_client_active,priority:1"` // Denormalized ClientID UUID (for UI display and API responses) Scopes string `gorm:"not null"` GrantedAt time.Time RevokedAt *time.Time - IsActive bool `gorm:"not null;default:true"` + IsActive bool `gorm:"not null;default:true;index:idx_user_active,priority:2;index:idx_client_active,priority:2"` CreatedAt time.Time UpdatedAt time.Time diff --git a/internal/store/audit_log.go b/internal/store/audit_log.go index 3a6a1f6e..c2161d60 100644 --- a/internal/store/audit_log.go +++ b/internal/store/audit_log.go @@ -89,10 +89,10 @@ func (s *Store) GetAuditLogsPaginated( return logs, pagination, nil } -// DeleteOldAuditLogs deletes audit logs older than the specified time +// DeleteOldAuditLogs deletes audit logs older than the specified time in +// bounded batches to keep lock duration short on large tables. func (s *Store) DeleteOldAuditLogs(olderThan time.Time) (int64, error) { - result := s.db.Where("created_at < ?", olderThan).Delete(&models.AuditLog{}) - return result.RowsAffected, result.Error + return s.deleteInBatches(&models.AuditLog{}, "created_at < ?", olderThan) } // GetAuditLogStats returns statistics about audit logs in a given time range diff --git a/internal/store/cleanup.go b/internal/store/cleanup.go index 01b13059..ff0eea40 100644 --- a/internal/store/cleanup.go +++ b/internal/store/cleanup.go @@ -8,12 +8,54 @@ import ( // Cleanup and Metrics operations (implements core.CleanupStore + core.MetricsStore) +// cleanupBatchSize bounds each DELETE to keep lock duration short on large tables. +// Chosen empirically: 10k rows × a single indexed DELETE completes in well under +// a second even on a busy PostgreSQL, so replicas / WAL apply do not fall behind. +// Declared as var (not const) so tests can shrink it to exercise the batching loop. +var cleanupBatchSize = 10000 + +// cleanupBatchPause gives WAL / replication / autovacuum breathing room between +// batches. A short sleep is sufficient; we are optimizing for background-job +// friendliness, not throughput. Declared as var so tests can zero it out. +var cleanupBatchPause = 200 * time.Millisecond + +// deleteInBatches DELETEs rows of `model` matching whereClause in batches of +// cleanupBatchSize. The subquery form (`id IN (SELECT id … LIMIT N)`) works on +// PostgreSQL — which does not support `DELETE … LIMIT` directly — and lets the +// inner SELECT use the WHERE-clause index (e.g. expires_at). +func (s *Store) deleteInBatches( + model any, + whereClause string, + args ...any, +) (int64, error) { + var total int64 + for { + sub := s.db.Model(model). + Select("id"). + Where(whereClause, args...). + Limit(cleanupBatchSize) + res := s.db.Where("id IN (?)", sub).Delete(model) + if res.Error != nil { + return total, res.Error + } + total += res.RowsAffected + if res.RowsAffected < int64(cleanupBatchSize) { + return total, nil + } + time.Sleep(cleanupBatchPause) + } +} + +// DeleteExpiredTokens removes access/refresh tokens past expiry. func (s *Store) DeleteExpiredTokens() error { - return s.db.Where("expires_at < ?", time.Now()).Delete(&models.AccessToken{}).Error + _, err := s.deleteInBatches(&models.AccessToken{}, "expires_at < ?", time.Now()) + return err } +// DeleteExpiredDeviceCodes removes device authorization codes past expiry. func (s *Store) DeleteExpiredDeviceCodes() error { - return s.db.Where("expires_at < ?", time.Now()).Delete(&models.DeviceCode{}).Error + _, err := s.deleteInBatches(&models.DeviceCode{}, "expires_at < ?", time.Now()) + return err } // CountActiveTokensByCategory counts active, non-expired tokens by category diff --git a/internal/store/cleanup_test.go b/internal/store/cleanup_test.go new file mode 100644 index 00000000..ce68402a --- /dev/null +++ b/internal/store/cleanup_test.go @@ -0,0 +1,181 @@ +package store + +import ( + "testing" + "time" + + "github.com/go-authgate/authgate/internal/models" + + "github.com/google/uuid" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// withSmallCleanupBatch shrinks the cleanup batch loop parameters for the +// duration of a test so we can exercise the multi-batch path with a realistic +// row count. Restores the production defaults on cleanup. +func withSmallCleanupBatch(t *testing.T, size int) { + t.Helper() + origSize := cleanupBatchSize + origPause := cleanupBatchPause + cleanupBatchSize = size + cleanupBatchPause = 0 + t.Cleanup(func() { + cleanupBatchSize = origSize + cleanupBatchPause = origPause + }) +} + +// seedExpiredTokens creates `count` AccessToken rows already past expiry. +func seedExpiredTokens(t *testing.T, s *Store, count int) { + t.Helper() + expired := time.Now().Add(-time.Hour) + tokens := make([]*models.AccessToken, 0, count) + for range count { + tokens = append(tokens, &models.AccessToken{ + ID: uuid.New().String(), + TokenHash: uuid.New().String(), + TokenCategory: models.TokenCategoryAccess, + Status: models.TokenStatusActive, + UserID: "user-" + uuid.New().String(), + ClientID: "client-x", + Scopes: "read", + ExpiresAt: expired, + }) + } + require.NoError(t, s.db.CreateInBatches(tokens, 100).Error) +} + +// seedActiveTokens creates `count` AccessToken rows still valid for a while. +func seedActiveTokens(t *testing.T, s *Store, count int) { + t.Helper() + future := time.Now().Add(time.Hour) + tokens := make([]*models.AccessToken, 0, count) + for range count { + tokens = append(tokens, &models.AccessToken{ + ID: uuid.New().String(), + TokenHash: uuid.New().String(), + TokenCategory: models.TokenCategoryAccess, + Status: models.TokenStatusActive, + UserID: "user-" + uuid.New().String(), + ClientID: "client-x", + Scopes: "read", + ExpiresAt: future, + }) + } + require.NoError(t, s.db.CreateInBatches(tokens, 100).Error) +} + +func TestDeleteExpiredTokensBatched(t *testing.T) { + withSmallCleanupBatch(t, 100) // force the loop to iterate + s := createFreshStore(t, "sqlite", nil) + + seedExpiredTokens(t, s, 250) + seedActiveTokens(t, s, 30) + + require.NoError(t, s.DeleteExpiredTokens()) + + var remaining int64 + require.NoError(t, s.db.Model(&models.AccessToken{}).Count(&remaining).Error) + assert.Equal(t, int64(30), remaining, "only unexpired tokens should remain") +} + +func TestDeleteExpiredTokensNoRows(t *testing.T) { + withSmallCleanupBatch(t, 100) + s := createFreshStore(t, "sqlite", nil) + + // Nothing expired; cleanup must still complete without error and leave + // everything intact. + seedActiveTokens(t, s, 10) + + require.NoError(t, s.DeleteExpiredTokens()) + + var remaining int64 + require.NoError(t, s.db.Model(&models.AccessToken{}).Count(&remaining).Error) + assert.Equal(t, int64(10), remaining) +} + +func TestDeleteExpiredDeviceCodesBatched(t *testing.T) { + withSmallCleanupBatch(t, 50) + s := createFreshStore(t, "sqlite", nil) + + expired := time.Now().Add(-time.Hour) + future := time.Now().Add(time.Hour) + + expiredCodes := make([]*models.DeviceCode, 0, 120) + for range 120 { + expiredCodes = append(expiredCodes, &models.DeviceCode{ + DeviceCodeHash: uuid.New().String(), + DeviceCodeSalt: "salt", + DeviceCodeID: uuid.New().String()[:8], + UserCode: uuid.New().String()[:8], + ClientID: "client-x", + Scopes: "read", + ExpiresAt: expired, + }) + } + require.NoError(t, s.db.CreateInBatches(expiredCodes, 40).Error) + + activeCodes := make([]*models.DeviceCode, 0, 15) + for range 15 { + activeCodes = append(activeCodes, &models.DeviceCode{ + DeviceCodeHash: uuid.New().String(), + DeviceCodeSalt: "salt", + DeviceCodeID: uuid.New().String()[:8], + UserCode: uuid.New().String()[:8], + ClientID: "client-x", + Scopes: "read", + ExpiresAt: future, + }) + } + require.NoError(t, s.db.CreateInBatches(activeCodes, 40).Error) + + require.NoError(t, s.DeleteExpiredDeviceCodes()) + + var remaining int64 + require.NoError(t, s.db.Model(&models.DeviceCode{}).Count(&remaining).Error) + assert.Equal(t, int64(15), remaining) +} + +func TestDeleteOldAuditLogsBatched(t *testing.T) { + withSmallCleanupBatch(t, 50) + s := createFreshStore(t, "sqlite", nil) + + // 80 old entries and 20 recent entries; retention cutoff is "now minus 1h" + oldTime := time.Now().Add(-24 * time.Hour) + recent := time.Now() + + oldLogs := make([]*models.AuditLog, 0, 80) + for range 80 { + oldLogs = append(oldLogs, &models.AuditLog{ + ID: uuid.New().String(), + EventType: models.EventAuthenticationSuccess, + EventTime: oldTime, + Severity: models.SeverityInfo, + Action: "test", + CreatedAt: oldTime, + }) + } + require.NoError(t, s.db.CreateInBatches(oldLogs, 40).Error) + + newLogs := make([]*models.AuditLog, 0, 20) + for range 20 { + newLogs = append(newLogs, &models.AuditLog{ + ID: uuid.New().String(), + EventType: models.EventAuthenticationSuccess, + EventTime: recent, + Severity: models.SeverityInfo, + Action: "test", + CreatedAt: recent, + }) + } + require.NoError(t, s.db.CreateInBatches(newLogs, 40).Error) + + deleted, err := s.DeleteOldAuditLogs(time.Now().Add(-time.Hour)) + require.NoError(t, err) + assert.Equal(t, int64(80), deleted) + + var remaining int64 + require.NoError(t, s.db.Model(&models.AuditLog{}).Count(&remaining).Error) + assert.Equal(t, int64(20), remaining) +} diff --git a/internal/store/migrations/001_add_phase1_indexes.sql b/internal/store/migrations/001_add_phase1_indexes.sql new file mode 100644 index 00000000..8c2ff3f1 --- /dev/null +++ b/internal/store/migrations/001_add_phase1_indexes.sql @@ -0,0 +1,43 @@ +-- Phase 1 index additions for AuthGate performance at 20k+ user scale. +-- +-- Usage notes +-- * PostgreSQL only. Run each statement individually; CREATE INDEX +-- CONCURRENTLY cannot run inside a transaction block. +-- * Safe to re-run: `IF NOT EXISTS` short-circuits when indexes already exist. +-- * AutoMigrate in internal/store/sqlite.go will create equivalent indexes on +-- fresh databases via GORM tags; this file is only needed for existing +-- production databases where CONCURRENTLY is required to avoid write locks. + +-- oauth_applications: owner lookups and status filtering +CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_oauth_app_user_id + ON oauth_applications (user_id); +CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_oauth_app_status + ON oauth_applications (status); + +-- user_authorizations: ListUserAuthorizations / RevokeAllUserAuthorizationsByClientID +CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_user_active + ON user_authorizations (user_id, is_active); +CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_client_active + ON user_authorizations (client_id, is_active); + +-- oauth_connections: GetOAuthConnectionsByUserID +CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_oauth_conn_user_id + ON oauth_connections (user_id); + +-- audit_logs: admin filter combinations +CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_audit_severity + ON audit_logs (severity); +CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_audit_type_time + ON audit_logs (event_type, event_time DESC); +CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_audit_actor_time + ON audit_logs (actor_user_id, event_time DESC); +CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_audit_resource_time + ON audit_logs (resource_type, resource_id, event_time DESC); + +-- device_codes: CountPendingDeviceCodes / pending polling lookups +CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_device_auth_exp + ON device_codes (authorized, expires_at); + +-- authorization_codes: cleanup and replay-prevention queries +CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_authcode_used_exp + ON authorization_codes (used_at, expires_at); From 3c74ea3c761d7395754cccad19b7b815e3fc1775 Mon Sep 17 00:00:00 2001 From: Bo-Yi Wu Date: Sun, 12 Apr 2026 22:12:36 +0800 Subject: [PATCH 2/4] fix(db): align gorm index names with migration SQL and honor audit cleanup interval - Rename GORM index tags on oauth_applications.user_id / status, oauth_connections.user_id, and audit_logs.severity to match the names used in 001_add_phase1_indexes.sql so AutoMigrate does not create duplicate default-named indexes on existing PostgreSQL deployments - Use cfg.AuditLogCleanupInterval instead of a hard-coded 24h ticker in addAuditLogCleanupJob and treat <= 0 as disabling cleanup Co-Authored-By: Claude Opus 4.6 (1M context) --- internal/bootstrap/server.go | 4 ++-- internal/models/audit_log.go | 2 +- internal/models/oauth_application.go | 6 +++--- internal/models/oauth_connection.go | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/internal/bootstrap/server.go b/internal/bootstrap/server.go index 282c3eb2..333ff7cc 100644 --- a/internal/bootstrap/server.go +++ b/internal/bootstrap/server.go @@ -128,7 +128,7 @@ func addAuditLogCleanupJob( auditService core.AuditLogger, locker rueidislock.Locker, ) { - if !cfg.EnableAuditLogging || cfg.AuditLogRetention <= 0 { + if !cfg.EnableAuditLogging || cfg.AuditLogRetention <= 0 || cfg.AuditLogCleanupInterval <= 0 { return } @@ -147,7 +147,7 @@ func addAuditLogCleanupJob( } m.AddRunningJob(func(ctx context.Context) error { - ticker := time.NewTicker(24 * time.Hour) + ticker := time.NewTicker(cfg.AuditLogCleanupInterval) defer ticker.Stop() if err := run(ctx); err != nil { diff --git a/internal/models/audit_log.go b/internal/models/audit_log.go index 662a1551..a8fc1f1d 100644 --- a/internal/models/audit_log.go +++ b/internal/models/audit_log.go @@ -139,7 +139,7 @@ type AuditLog struct { // Event information EventType EventType `gorm:"type:varchar(50);index;not null;index:idx_audit_type_time,priority:1" json:"event_type"` EventTime time.Time `gorm:"index;not null;index:idx_audit_type_time,priority:2,sort:desc;index:idx_audit_actor_time,priority:2,sort:desc;index:idx_audit_resource_time,priority:3,sort:desc" json:"event_time"` - Severity EventSeverity `gorm:"type:varchar(20);not null;index" json:"severity"` + Severity EventSeverity `gorm:"type:varchar(20);not null;index:idx_audit_severity" json:"severity"` // Actor information ActorUserID string `gorm:"type:varchar(36);index;index:idx_audit_actor_time,priority:1" json:"actor_user_id"` diff --git a/internal/models/oauth_application.go b/internal/models/oauth_application.go index a86b709a..67897eaa 100644 --- a/internal/models/oauth_application.go +++ b/internal/models/oauth_application.go @@ -35,15 +35,15 @@ type OAuthApplication struct { ClientSecret string `gorm:"not null"` // bcrypt hashed secret ClientName string `gorm:"not null"` Description string `gorm:"type:text"` - UserID string `gorm:"not null;index"` + UserID string `gorm:"not null;index:idx_oauth_app_user_id"` Scopes string `gorm:"not null"` GrantTypes string `gorm:"not null;default:'device_code'"` RedirectURIs StringArray `gorm:"type:json"` ClientType string `gorm:"not null;default:'public'"` // "confidential" or "public" EnableDeviceFlow bool `gorm:"not null;default:true"` EnableAuthCodeFlow bool `gorm:"not null;default:false"` - EnableClientCredentialsFlow bool `gorm:"not null;default:false"` // Client Credentials Grant (RFC 6749 §4.4); confidential clients only - Status string `gorm:"not null;default:'active';index"` // ClientStatusPending / ClientStatusActive / ClientStatusInactive + EnableClientCredentialsFlow bool `gorm:"not null;default:false"` // Client Credentials Grant (RFC 6749 §4.4); confidential clients only + Status string `gorm:"not null;default:'active';index:idx_oauth_app_status"` // ClientStatusPending / ClientStatusActive / ClientStatusInactive CreatedBy string CreatedAt time.Time UpdatedAt time.Time diff --git a/internal/models/oauth_connection.go b/internal/models/oauth_connection.go index f8742410..fd7f5756 100644 --- a/internal/models/oauth_connection.go +++ b/internal/models/oauth_connection.go @@ -7,7 +7,7 @@ import ( // OAuthConnection represents an OAuth provider connection for a user type OAuthConnection struct { ID string `gorm:"primaryKey"` - UserID string `gorm:"not null;index;uniqueIndex:idx_oauth_user_provider,priority:1"` + UserID string `gorm:"not null;index:idx_oauth_conn_user_id;uniqueIndex:idx_oauth_user_provider,priority:1"` Provider string `gorm:"not null;uniqueIndex:idx_oauth_provider_user,priority:1;uniqueIndex:idx_oauth_user_provider,priority:2"` // "github", "gitea", "gitlab" ProviderUserID string `gorm:"not null;uniqueIndex:idx_oauth_provider_user,priority:2"` // Provider's user ID From d8fe3219b1428c6a1d5b483eeebeefee1dcce2a1 Mon Sep 17 00:00:00 2001 From: Bo-Yi Wu Date: Sun, 12 Apr 2026 22:20:55 +0800 Subject: [PATCH 3/4] fix(db): guard cleanup batch helper against misconfiguration and document id assumption - Rename deleteInBatches to deleteByIDInBatches and document that it requires an id primary key column - Return an error when cleanupBatchSize <= 0 so a misconfigured var cannot silently loop forever via GORM Limit(0) disabling the clause - Also terminate on RowsAffected == 0 as a belt-and-braces exit condition - Add a unit test covering the invalid batch size guard Co-Authored-By: Claude Opus 4.6 (1M context) --- internal/store/audit_log.go | 2 +- internal/store/cleanup.go | 22 +++++++++++++++------- internal/store/cleanup_test.go | 17 +++++++++++++++++ 3 files changed, 33 insertions(+), 8 deletions(-) diff --git a/internal/store/audit_log.go b/internal/store/audit_log.go index c2161d60..8f54d9ee 100644 --- a/internal/store/audit_log.go +++ b/internal/store/audit_log.go @@ -92,7 +92,7 @@ func (s *Store) GetAuditLogsPaginated( // DeleteOldAuditLogs deletes audit logs older than the specified time in // bounded batches to keep lock duration short on large tables. func (s *Store) DeleteOldAuditLogs(olderThan time.Time) (int64, error) { - return s.deleteInBatches(&models.AuditLog{}, "created_at < ?", olderThan) + return s.deleteByIDInBatches(&models.AuditLog{}, "created_at < ?", olderThan) } // GetAuditLogStats returns statistics about audit logs in a given time range diff --git a/internal/store/cleanup.go b/internal/store/cleanup.go index ff0eea40..bdda49f2 100644 --- a/internal/store/cleanup.go +++ b/internal/store/cleanup.go @@ -1,6 +1,7 @@ package store import ( + "fmt" "time" "github.com/go-authgate/authgate/internal/models" @@ -19,15 +20,22 @@ var cleanupBatchSize = 10000 // friendliness, not throughput. Declared as var so tests can zero it out. var cleanupBatchPause = 200 * time.Millisecond -// deleteInBatches DELETEs rows of `model` matching whereClause in batches of -// cleanupBatchSize. The subquery form (`id IN (SELECT id … LIMIT N)`) works on -// PostgreSQL — which does not support `DELETE … LIMIT` directly — and lets the +// deleteByIDInBatches DELETEs rows of `model` matching whereClause in batches of +// cleanupBatchSize, using the subquery form `id IN (SELECT id … LIMIT N)`. +// PostgreSQL does not support `DELETE … LIMIT` directly, and this form lets the // inner SELECT use the WHERE-clause index (e.g. expires_at). -func (s *Store) deleteInBatches( +// +// The helper assumes `model` has an `id` primary-key column — all current +// cleanup targets (AccessToken, DeviceCode, AuditLog) satisfy this. Do not call +// it with models whose PK column is named differently. +func (s *Store) deleteByIDInBatches( model any, whereClause string, args ...any, ) (int64, error) { + if cleanupBatchSize <= 0 { + return 0, fmt.Errorf("cleanupBatchSize must be positive, got %d", cleanupBatchSize) + } var total int64 for { sub := s.db.Model(model). @@ -39,7 +47,7 @@ func (s *Store) deleteInBatches( return total, res.Error } total += res.RowsAffected - if res.RowsAffected < int64(cleanupBatchSize) { + if res.RowsAffected == 0 || res.RowsAffected < int64(cleanupBatchSize) { return total, nil } time.Sleep(cleanupBatchPause) @@ -48,13 +56,13 @@ func (s *Store) deleteInBatches( // DeleteExpiredTokens removes access/refresh tokens past expiry. func (s *Store) DeleteExpiredTokens() error { - _, err := s.deleteInBatches(&models.AccessToken{}, "expires_at < ?", time.Now()) + _, err := s.deleteByIDInBatches(&models.AccessToken{}, "expires_at < ?", time.Now()) return err } // DeleteExpiredDeviceCodes removes device authorization codes past expiry. func (s *Store) DeleteExpiredDeviceCodes() error { - _, err := s.deleteInBatches(&models.DeviceCode{}, "expires_at < ?", time.Now()) + _, err := s.deleteByIDInBatches(&models.DeviceCode{}, "expires_at < ?", time.Now()) return err } diff --git a/internal/store/cleanup_test.go b/internal/store/cleanup_test.go index ce68402a..42fa0381 100644 --- a/internal/store/cleanup_test.go +++ b/internal/store/cleanup_test.go @@ -80,6 +80,23 @@ func TestDeleteExpiredTokensBatched(t *testing.T) { assert.Equal(t, int64(30), remaining, "only unexpired tokens should remain") } +func TestDeleteExpiredTokensInvalidBatchSize(t *testing.T) { + // Guard against a misconfigured batch size silently looping forever: GORM + // treats Limit(0) as "no limit", so the old termination check never fired. + withSmallCleanupBatch(t, 0) + s := createFreshStore(t, "sqlite", nil) + + seedExpiredTokens(t, s, 5) + + err := s.DeleteExpiredTokens() + require.Error(t, err) + assert.Contains(t, err.Error(), "cleanupBatchSize must be positive") + + var remaining int64 + require.NoError(t, s.db.Model(&models.AccessToken{}).Count(&remaining).Error) + assert.Equal(t, int64(5), remaining, "no rows should be deleted when guard fires") +} + func TestDeleteExpiredTokensNoRows(t *testing.T) { withSmallCleanupBatch(t, 100) s := createFreshStore(t, "sqlite", nil) From ef52a809462fcfdec382eee08d74f1880b65e059 Mon Sep 17 00:00:00 2001 From: Bo-Yi Wu Date: Sun, 12 Apr 2026 22:31:59 +0800 Subject: [PATCH 4/4] fix(db): harden cleanup config validation and remove redundant index - Validate EnableCleanupLock requires REDIS_ADDR and positive CleanupLockKeyValidity, and require positive ExpiredTokenCleanupInterval when expired-token cleanup is enabled, so misconfiguration fails fast instead of panicking time.NewTicker or throwing runtime lock errors - Guard addExpiredTokenCleanupJob against non-positive interval as defense in depth - Drop standalone idx_oauth_conn_user_id in favor of the existing composite unique index idx_oauth_user_provider whose leading column already serves user_id-only predicates - Flip METRICS_GAUGE_UPDATE_ENABLED default to false in the production template so copy-pasting into every pod does not double-count gauge metrics Co-Authored-By: Claude Opus 4.6 (1M context) --- .env.production.example | 8 ++- internal/bootstrap/server.go | 2 +- internal/config/config.go | 22 ++++++ internal/config/config_test.go | 71 +++++++++++++++++++ internal/models/oauth_connection.go | 2 +- .../migrations/001_add_phase1_indexes.sql | 7 +- 6 files changed, 104 insertions(+), 8 deletions(-) diff --git a/.env.production.example b/.env.production.example index dccb7db2..df5597e8 100644 --- a/.env.production.example +++ b/.env.production.example @@ -63,9 +63,11 @@ RATE_LIMIT_STORE=redis # ---- Metrics ---------------------------------------------------------------- METRICS_ENABLED=true -# Enable on ONE replica only; other pods should set this to false to avoid -# duplicated gauge updates. Alternatively, use PromQL avg()/max() aggregation. -METRICS_GAUGE_UPDATE_ENABLED=true +# Gauge updates query global counts; if every pod runs them you get duplicated +# values across the fleet. Default this template to false so copying the file +# into all pods is safe. On ONE dedicated replica set METRICS_GAUGE_UPDATE_ENABLED=true +# (or set it true on all pods and aggregate with avg()/max() in PromQL). +METRICS_GAUGE_UPDATE_ENABLED=false # ---- Sessions --------------------------------------------------------------- SESSION_FINGERPRINT=true diff --git a/internal/bootstrap/server.go b/internal/bootstrap/server.go index 333ff7cc..f6ff27f1 100644 --- a/internal/bootstrap/server.go +++ b/internal/bootstrap/server.go @@ -309,7 +309,7 @@ func addExpiredTokenCleanupJob( cfg *config.Config, locker rueidislock.Locker, ) { - if !cfg.EnableExpiredTokenCleanup { + if !cfg.EnableExpiredTokenCleanup || cfg.ExpiredTokenCleanupInterval <= 0 { return } diff --git a/internal/config/config.go b/internal/config/config.go index 2a8ff395..824bcd4c 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -1,6 +1,7 @@ package config import ( + "errors" "fmt" "os" "strings" @@ -636,6 +637,27 @@ func (c *Config) Validate() error { ) } + // Cleanup lock validation (only when enabled) + if c.EnableCleanupLock { + if c.RedisAddr == "" { + return errors.New("REDIS_ADDR must be set when ENABLE_CLEANUP_LOCK=true") + } + if c.CleanupLockKeyValidity <= 0 { + return fmt.Errorf( + "CLEANUP_LOCK_KEY_VALIDITY must be a positive duration when ENABLE_CLEANUP_LOCK=true (got %s)", + c.CleanupLockKeyValidity, + ) + } + } + + // Expired-token cleanup interval must be positive when cleanup is enabled. + if c.EnableExpiredTokenCleanup && c.ExpiredTokenCleanupInterval <= 0 { + return fmt.Errorf( + "EXPIRED_TOKEN_CLEANUP_INTERVAL must be a positive duration when ENABLE_EXPIRED_TOKEN_CLEANUP=true (got %s)", + c.ExpiredTokenCleanupInterval, + ) + } + // Token cache validation (only when enabled) if c.TokenCacheEnabled { if err := validateCacheType("TOKEN_CACHE_TYPE", c.TokenCacheType, c.RedisAddr); err != nil { diff --git a/internal/config/config_test.go b/internal/config/config_test.go index a2d31ef5..b4a25fd1 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -751,6 +751,77 @@ func TestConfig_Validate_SessionRememberMeMaxAge(t *testing.T) { }) } +func TestConfig_Validate_CleanupLock(t *testing.T) { + t.Run("enabled without redis address fails", func(t *testing.T) { + cfg := validBaseConfig() + cfg.EnableCleanupLock = true + cfg.CleanupLockKeyValidity = 5 * time.Minute + cfg.RedisAddr = "" + err := cfg.Validate() + require.Error(t, err) + assert.Contains(t, err.Error(), "REDIS_ADDR must be set when ENABLE_CLEANUP_LOCK=true") + }) + + t.Run("enabled with zero validity fails", func(t *testing.T) { + cfg := validBaseConfig() + cfg.EnableCleanupLock = true + cfg.RedisAddr = "localhost:6379" + cfg.CleanupLockKeyValidity = 0 + err := cfg.Validate() + require.Error(t, err) + assert.Contains(t, err.Error(), "CLEANUP_LOCK_KEY_VALIDITY must be a positive duration") + }) + + t.Run("enabled with valid settings passes", func(t *testing.T) { + cfg := validBaseConfig() + cfg.EnableCleanupLock = true + cfg.RedisAddr = "localhost:6379" + cfg.CleanupLockKeyValidity = 5 * time.Minute + err := cfg.Validate() + require.NoError(t, err) + }) + + t.Run("disabled does not require redis or validity", func(t *testing.T) { + cfg := validBaseConfig() + cfg.EnableCleanupLock = false + cfg.RedisAddr = "" + cfg.CleanupLockKeyValidity = 0 + err := cfg.Validate() + require.NoError(t, err) + }) +} + +func TestConfig_Validate_ExpiredTokenCleanupInterval(t *testing.T) { + t.Run("enabled with zero interval fails", func(t *testing.T) { + cfg := validBaseConfig() + cfg.EnableExpiredTokenCleanup = true + cfg.ExpiredTokenCleanupInterval = 0 + err := cfg.Validate() + require.Error(t, err) + assert.Contains( + t, + err.Error(), + "EXPIRED_TOKEN_CLEANUP_INTERVAL must be a positive duration", + ) + }) + + t.Run("enabled with positive interval passes", func(t *testing.T) { + cfg := validBaseConfig() + cfg.EnableExpiredTokenCleanup = true + cfg.ExpiredTokenCleanupInterval = 30 * time.Minute + err := cfg.Validate() + require.NoError(t, err) + }) + + t.Run("disabled skips interval validation", func(t *testing.T) { + cfg := validBaseConfig() + cfg.EnableExpiredTokenCleanup = false + cfg.ExpiredTokenCleanupInterval = 0 + err := cfg.Validate() + require.NoError(t, err) + }) +} + func TestAuthCodeFlowConfigFields(t *testing.T) { // Verify the fields exist and have sensible types/values cfg := &Config{ diff --git a/internal/models/oauth_connection.go b/internal/models/oauth_connection.go index fd7f5756..010edba6 100644 --- a/internal/models/oauth_connection.go +++ b/internal/models/oauth_connection.go @@ -7,7 +7,7 @@ import ( // OAuthConnection represents an OAuth provider connection for a user type OAuthConnection struct { ID string `gorm:"primaryKey"` - UserID string `gorm:"not null;index:idx_oauth_conn_user_id;uniqueIndex:idx_oauth_user_provider,priority:1"` + UserID string `gorm:"not null;uniqueIndex:idx_oauth_user_provider,priority:1"` Provider string `gorm:"not null;uniqueIndex:idx_oauth_provider_user,priority:1;uniqueIndex:idx_oauth_user_provider,priority:2"` // "github", "gitea", "gitlab" ProviderUserID string `gorm:"not null;uniqueIndex:idx_oauth_provider_user,priority:2"` // Provider's user ID diff --git a/internal/store/migrations/001_add_phase1_indexes.sql b/internal/store/migrations/001_add_phase1_indexes.sql index 8c2ff3f1..b9d55c79 100644 --- a/internal/store/migrations/001_add_phase1_indexes.sql +++ b/internal/store/migrations/001_add_phase1_indexes.sql @@ -20,9 +20,10 @@ CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_user_active CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_client_active ON user_authorizations (client_id, is_active); --- oauth_connections: GetOAuthConnectionsByUserID -CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_oauth_conn_user_id - ON oauth_connections (user_id); +-- oauth_connections: GetOAuthConnectionsByUserID / DeleteOAuthConnectionsByUserID +-- are served by the existing unique composite index idx_oauth_user_provider +-- (user_id, provider); PostgreSQL uses the leading column for user_id-only +-- predicates, so no standalone user_id index is required. -- audit_logs: admin filter combinations CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_audit_severity