Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 73 additions & 0 deletions .env.production.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# AuthGate production environment template for multi-pod deployments
# (20k+ users, 5+ replicas, PostgreSQL + Redis).
#
# Copy to .env or inject via your secrets manager / Helm values, then override
# secrets (JWT_SECRET, SESSION_SECRET, DATABASE_DSN, REDIS_ADDR, etc.) to match
# your infrastructure. Adjust cache/pool sizes only after observing real
# traffic metrics (cache hit rate, DB CPU, connection count).
# =============================================================================

ENVIRONMENT=production

# ---- Secrets (REQUIRED: generate fresh values with `openssl rand -hex 32`) ---
JWT_SECRET=CHANGE_ME
SESSION_SECRET=CHANGE_ME

# ---- Database ---------------------------------------------------------------
DATABASE_DRIVER=postgres
DATABASE_DSN=host=postgres user=authgate password=CHANGE_ME dbname=authgate port=5432 sslmode=require

# Connection pool: 5 pods × 25 conns = 125; ensure PG max_connections >= 200.
DB_MAX_OPEN_CONNS=25
DB_MAX_IDLE_CONNS=10
DB_CONN_MAX_LIFETIME=5m
DB_CONN_MAX_IDLE_TIME=10m

# ---- Redis (shared cache + rate limit + cleanup lock) -----------------------
REDIS_ADDR=redis:6379
# REDIS_PASSWORD=
# REDIS_DB=0

# ---- Token verification cache (major DB-load reducer) -----------------------
# Off by default in .env.example; production should enable this.
TOKEN_CACHE_ENABLED=true
TOKEN_CACHE_TYPE=redis-aside
TOKEN_CACHE_TTL=10h
TOKEN_CACHE_CLIENT_TTL=1h
# If pod memory is tight, drop this to 16 (MB) per connection.
TOKEN_CACHE_SIZE_PER_CONN=32

# ---- Client / User / Metrics cache (shared across pods) ---------------------
CLIENT_CACHE_TYPE=redis-aside
CLIENT_COUNT_CACHE_TYPE=redis-aside
USER_CACHE_TYPE=redis-aside
METRICS_CACHE_TYPE=redis-aside

# ---- Expired token / device code cleanup ------------------------------------
# All pods may enable this: a Redis-backed distributed lock (below) prevents
# concurrent runs — only one pod does the DELETE each interval.
ENABLE_EXPIRED_TOKEN_CLEANUP=true
EXPIRED_TOKEN_CLEANUP_INTERVAL=30m

# Distributed cleanup lock via rueidislock. Required for multi-pod.
ENABLE_CLEANUP_LOCK=true
CLEANUP_LOCK_KEY_VALIDITY=5m

# ---- Audit logging ----------------------------------------------------------
ENABLE_AUDIT_LOGGING=true
# 90 days. (Comment kept on its own line: some dotenv loaders treat trailing
# text after an unquoted value as part of the value.)
AUDIT_LOG_RETENTION=2160h

# ---- Rate limiting (distributed) --------------------------------------------
ENABLE_RATE_LIMIT=true
RATE_LIMIT_STORE=redis

# ---- Metrics ----------------------------------------------------------------
METRICS_ENABLED=true
# Gauge updates query global counts; if every pod runs them you get duplicated
# values across the fleet. Default this template to false so copying the file
# into all pods is safe. On ONE dedicated replica set METRICS_GAUGE_UPDATE_ENABLED=true
# (or set it true on all pods and aggregate with avg()/max() in PromQL).
METRICS_GAUGE_UPDATE_ENABLED=false

# ---- Sessions ---------------------------------------------------------------
SESSION_FINGERPRINT=true
13 changes: 11 additions & 2 deletions internal/bootstrap/bootstrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (
"github.com/gin-gonic/gin"
"github.com/prometheus/client_golang/prometheus"
"github.com/redis/go-redis/v9"
"github.com/redis/rueidis/rueidislock"
)

// Application holds all initialized components
Expand All @@ -39,6 +40,7 @@ type Application struct {
TokenCache core.Cache[models.AccessToken]
TokenCacheCloser func() error
RateLimitRedisClient *redis.Client
CleanupLocker rueidislock.Locker

// Services
AuditService core.AuditLogger
Expand Down Expand Up @@ -137,6 +139,12 @@ func (app *Application) initializeInfrastructure(ctx context.Context) error {
return err
}

// Distributed cleanup lock (multi-pod: serialize DELETE jobs)
app.CleanupLocker, err = initializeCleanupLocker(app.Config)
if err != nil {
return err
}

return nil
}

Expand Down Expand Up @@ -222,8 +230,9 @@ func (app *Application) startWithGracefulShutdown() {
addClientCacheCleanupJob(m, app.ClientCache, app.Config)
addTokenCacheCleanupJob(m, app.TokenCache, app.Config)
addDatabaseShutdownJob(m, app.DB, app.Config)
addAuditLogCleanupJob(m, app.Config, app.AuditService)
addExpiredTokenCleanupJob(m, app.DB, app.Config)
addAuditLogCleanupJob(m, app.Config, app.AuditService, app.CleanupLocker)
addExpiredTokenCleanupJob(m, app.DB, app.Config, app.CleanupLocker)
addCleanupLockerShutdownJob(m, app.CleanupLocker)
addMetricsGaugeUpdateJob(m, app.Config, app.DB, app.MetricsRecorder, app.MetricsCache)

// Wait for graceful shutdown
Expand Down
80 changes: 80 additions & 0 deletions internal/bootstrap/cleanup_lock.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
package bootstrap

import (
"context"
"errors"
"fmt"
"log"

"github.com/redis/rueidis"
"github.com/redis/rueidis/rueidislock"

"github.com/go-authgate/authgate/internal/config"
)

// Lock names for the distributed cleanup jobs. Each periodic cleanup job
// acquires its own named lock so the two jobs never block each other.
// The full Redis key is prefixed with "authgate:lock" by the locker
// configuration. Keep these values in sync with docs/runbooks that may
// reference the keys when debugging stuck cleanups.
const (
	cleanupLockAuditLogs     = "cleanup:audit-logs"
	cleanupLockExpiredTokens = "cleanup:expired-tokens"
)

// initializeCleanupLocker builds a Redis-backed distributed locker that
// serializes periodic cleanup jobs across multi-pod deployments. Returns
// (nil, nil) when cleanup lock is disabled; callers treat a nil locker as
// "run unconditionally" (single-instance mode).
//
// KeyMajority is 1 (single Redis target) rather than a Redlock quorum. A
// Redis failover window could allow two pods to hold the lock simultaneously,
// but cleanup DELETEs are idempotent (the inner SELECT finds no matching rows
// on the second pod), so this is safe — the worst case is transient double
// work, never data loss or corruption.
// initializeCleanupLocker constructs the Redis-backed locker used to ensure
// that periodic cleanup jobs run on at most one pod at a time. When the
// cleanup-lock feature is disabled it returns (nil, nil); callers interpret
// a nil locker as "no coordination needed" and run every tick locally.
//
// KeyMajority is deliberately 1 (a single Redis target, not a Redlock
// quorum). During a Redis failover window two pods could briefly both
// believe they hold the lock, but the cleanup DELETEs are idempotent — the
// slower pod simply finds no matching rows — so the worst case is transient
// duplicate work, never data loss or corruption.
func initializeCleanupLocker(cfg *config.Config) (rueidislock.Locker, error) {
	if !cfg.EnableCleanupLock {
		return nil, nil //nolint:nilnil // locker not needed when feature is disabled
	}
	if cfg.RedisAddr == "" {
		return nil, errors.New("ENABLE_CLEANUP_LOCK requires REDIS_ADDR to be set")
	}

	// Build the locker options up front; the lock keys share one prefix so
	// they are easy to find when inspecting Redis by hand.
	opt := rueidislock.LockerOption{
		ClientOption: rueidis.ClientOption{
			InitAddress: []string{cfg.RedisAddr},
			Password:    cfg.RedisPassword,
			SelectDB:    cfg.RedisDB,
		},
		KeyPrefix:   "authgate:lock",
		KeyMajority: 1,
		KeyValidity: cfg.CleanupLockKeyValidity,
	}

	locker, err := rueidislock.NewLocker(opt)
	if err != nil {
		return nil, fmt.Errorf("failed to create cleanup locker: %w", err)
	}

	log.Printf("Cleanup lock initialized (validity: %v)", cfg.CleanupLockKeyValidity)
	return locker, nil
}

// runWithCleanupLock executes fn while holding the named distributed lock.
// When locker is nil (single-instance mode) fn runs unconditionally.
// When another pod currently holds the lock, fn is skipped silently and
// nil is returned — the next tick will try again.
// runWithCleanupLock runs fn under the distributed lock identified by name.
// A nil locker means single-instance mode: fn executes directly with ctx.
// If another pod currently holds the lock, fn is skipped and nil is returned
// so the caller simply waits for its next tick; any other acquisition
// failure is reported as an error.
func runWithCleanupLock(
	ctx context.Context,
	locker rueidislock.Locker,
	name string,
	fn func(context.Context) error,
) error {
	if locker == nil {
		return fn(ctx)
	}

	lockCtx, cancel, err := locker.TryWithContext(ctx, name)
	switch {
	case errors.Is(err, rueidislock.ErrNotLocked):
		// Lock is held elsewhere — skip this tick silently.
		return nil
	case err != nil:
		return fmt.Errorf("acquire cleanup lock %q: %w", name, err)
	}
	defer cancel()

	// lockCtx is canceled if the lock is lost, letting fn bail out early.
	return fn(lockCtx)
}
96 changes: 69 additions & 27 deletions internal/bootstrap/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (

"github.com/appleboy/graceful"
"github.com/redis/go-redis/v9"
"github.com/redis/rueidis/rueidislock"
)

// createHTTPServer creates the HTTP server instance
Expand Down Expand Up @@ -118,36 +119,46 @@ func addAuditServiceShutdownJob(
})
}

// addAuditLogCleanupJob adds periodic audit log cleanup job
// addAuditLogCleanupJob adds periodic audit log cleanup job.
// When a cleanup locker is supplied, only one pod across the fleet performs
// the DELETE per tick; others skip silently.
func addAuditLogCleanupJob(
m *graceful.Manager,
cfg *config.Config,
auditService core.AuditLogger,
locker rueidislock.Locker,
) {
if !cfg.EnableAuditLogging || cfg.AuditLogRetention <= 0 {
if !cfg.EnableAuditLogging || cfg.AuditLogRetention <= 0 || cfg.AuditLogCleanupInterval <= 0 {
return
}

run := func(ctx context.Context) error {
return runWithCleanupLock(ctx, locker, cleanupLockAuditLogs, func(context.Context) error {
deleted, err := auditService.CleanupOldLogs(cfg.AuditLogRetention)
if err != nil {
log.Printf("Failed to cleanup old audit logs: %v", err)
return nil
}
if deleted > 0 {
log.Printf("Cleaned up %d old audit logs", deleted)
}
return nil
})
}

m.AddRunningJob(func(ctx context.Context) error {
ticker := time.NewTicker(24 * time.Hour)
ticker := time.NewTicker(cfg.AuditLogCleanupInterval)
defer ticker.Stop()

Comment thread
appleboy marked this conversation as resolved.
// Run cleanup immediately on startup
if deleted, err := auditService.CleanupOldLogs(cfg.AuditLogRetention); err != nil {
log.Printf("Failed to cleanup old audit logs: %v", err)
} else if deleted > 0 {
log.Printf("Cleaned up %d old audit logs", deleted)
if err := run(ctx); err != nil {
log.Printf("Audit log cleanup run error: %v", err)
}

for {
select {
case <-ticker.C:
if deleted, err := auditService.CleanupOldLogs(
cfg.AuditLogRetention,
); err != nil {
log.Printf("Failed to cleanup old audit logs: %v", err)
} else if deleted > 0 {
log.Printf("Cleaned up %d old audit logs", deleted)
if err := run(ctx); err != nil {
log.Printf("Audit log cleanup run error: %v", err)
}
case <-ctx.Done():
return nil
Expand Down Expand Up @@ -290,31 +301,48 @@ func addTokenCacheCleanupJob(

// addExpiredTokenCleanupJob adds a periodic job that purges expired access tokens
// and device codes from the database to prevent unbounded table growth.
func addExpiredTokenCleanupJob(m *graceful.Manager, db *store.Store, cfg *config.Config) {
if !cfg.EnableExpiredTokenCleanup {
// When a cleanup locker is supplied, only one pod across the fleet performs
// the DELETE per tick; others skip silently.
func addExpiredTokenCleanupJob(
m *graceful.Manager,
db *store.Store,
cfg *config.Config,
locker rueidislock.Locker,
) {
if !cfg.EnableExpiredTokenCleanup || cfg.ExpiredTokenCleanupInterval <= 0 {
return
}
Comment thread
appleboy marked this conversation as resolved.

run := func(ctx context.Context) error {
return runWithCleanupLock(
ctx,
locker,
cleanupLockExpiredTokens,
func(context.Context) error {
if err := db.DeleteExpiredTokens(); err != nil {
log.Printf("Failed to cleanup expired tokens: %v", err)
}
if err := db.DeleteExpiredDeviceCodes(); err != nil {
log.Printf("Failed to cleanup expired device codes: %v", err)
}
return nil
},
)
}

m.AddRunningJob(func(ctx context.Context) error {
ticker := time.NewTicker(cfg.ExpiredTokenCleanupInterval)
defer ticker.Stop()

// Run cleanup immediately on startup
if err := db.DeleteExpiredTokens(); err != nil {
log.Printf("Failed to cleanup expired tokens: %v", err)
}
if err := db.DeleteExpiredDeviceCodes(); err != nil {
log.Printf("Failed to cleanup expired device codes: %v", err)
if err := run(ctx); err != nil {
log.Printf("Expired token cleanup run error: %v", err)
}

for {
select {
case <-ticker.C:
if err := db.DeleteExpiredTokens(); err != nil {
log.Printf("Failed to cleanup expired tokens: %v", err)
}
if err := db.DeleteExpiredDeviceCodes(); err != nil {
log.Printf("Failed to cleanup expired device codes: %v", err)
if err := run(ctx); err != nil {
log.Printf("Expired token cleanup run error: %v", err)
}
case <-ctx.Done():
return nil
Expand All @@ -323,6 +351,20 @@ func addExpiredTokenCleanupJob(m *graceful.Manager, db *store.Store, cfg *config
})
}

// addCleanupLockerShutdownJob closes the distributed cleanup locker on
// shutdown. No-op when locker is nil.
// addCleanupLockerShutdownJob registers a graceful-shutdown hook that closes
// the distributed cleanup locker, releasing its Redis connections. When the
// locker is nil (cleanup lock disabled) nothing is registered.
func addCleanupLockerShutdownJob(m *graceful.Manager, locker rueidislock.Locker) {
	if locker == nil {
		return
	}

	m.AddShutdownJob(func() error {
		log.Println("Closing cleanup locker...")
		locker.Close()
		log.Println("Cleanup locker closed")
		return nil
	})
}

// addDatabaseShutdownJob adds database connection close handler
func addDatabaseShutdownJob(m *graceful.Manager, db *store.Store, cfg *config.Config) {
m.AddShutdownJob(func() error {
Expand Down
Loading
Loading