From 3f0c62a782534805a609849019766efb9abd7274 Mon Sep 17 00:00:00 2001
From: Joshua Gilman <joshuagilman@gmail.com>
Date: Wed, 24 Jun 2026 12:04:19 -0700
Subject: [PATCH] feat(api): add per-client IP rate limiting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rate limit the API per client IP, on by default, as a Huma middleware
installed before authentication so an over-limit request is rejected
with an RFC 9457 429 + Retry-After before it reaches the credential
store, shielding the auth path and database from anonymous floods. The
infrastructure routes (/healthz, /readyz, /metrics) bypass Huma and are
never limited.

The shipped limiter is in-process (per-key token bucket via
golang.org/x/time/rate, with idle-evicted buckets) behind a new
ratelimit.Limiter port — the seam for a distributed (for example,
Redis-backed) limiter. Keying is a pluggable KeyFunc whose default is the
spoof-safe resolved client IP; swap it to limit authenticated principals
instead. Config: --rate-limit-enabled (default true), --rate-limit-rps
(default 10), --rate-limit-burst (default 20).

Retry-After (RFC 9110) is the throttle signal; the IETF RateLimit
structured-field headers are still a draft and map only loosely onto a
token bucket, so they are left as a documented enhancement.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 DELETE_ME.md                           |   1 +
 README.md                              |  31 ++++++
 go.mod                                 |   1 +
 go.sum                                 |   2 +
 internal/adapter/http/ratelimit.go     |  24 ++++
 internal/adapter/http/router.go        |  34 ++++--
 internal/app/app.go                    |  40 ++++++-
 internal/app/app_test.go               |  45 ++++++++
 internal/app/serve.go                  |  14 +++
 internal/config/config.go              |  43 ++++++++
 internal/config/config_test.go         |  25 +++++
 internal/integration/authz_e2e_test.go |   5 +
 internal/ratelimit/limiter.go          |  40 +++++++
 internal/ratelimit/memory.go           | 146 +++++++++++++++++++++++++
 internal/ratelimit/memory_test.go      |  89 +++++++++++++++
 internal/ratelimit/middleware.go       | 111 +++++++++++++++++++
 internal/ratelimit/middleware_test.go  |  97 ++++++++++++++++
 17 files changed, 735 insertions(+), 13 deletions(-)
 create mode 100644 internal/adapter/http/ratelimit.go
 create mode 100644 internal/ratelimit/limiter.go
 create mode 100644 internal/ratelimit/memory.go
 create mode 100644 internal/ratelimit/memory_test.go
 create mode 100644 internal/ratelimit/middleware.go
 create mode 100644 internal/ratelimit/middleware_test.go

diff --git a/DELETE_ME.md b/DELETE_ME.md
index 8aefdaa..06de815 100644
--- a/DELETE_ME.md
+++ b/DELETE_ME.md
@@ -11,6 +11,7 @@ It is only here to orient the initial project owner.
 - A runnable hexagonal HTTP API server (chi + Huma) at `github.com/meigma/template-go-api`, with a `todo` example resource served under a `/v1` URL version prefix, RFC 9457 errors, unversioned `/healthz`, `/readyz`, and `/metrics`, runtime API docs at `/docs`, and an `openapi` spec-export command.
 - A PostgreSQL persistence adapter (pgx + sqlc typed queries + goose migrations) behind the domain's `todo.Repository` port: a `migrate` subcommand, a committed-and-drift-guarded sqlc layer, a real `/readyz` check, and container-backed integration tests. The port is the seam — implement it to back the template with a different datastore.
 - An authorization tier (Cedar via `cedar-go`) with a deny-by-default Huma middleware, a modular per-resource authz slice pattern, and authentication deferred to the integrator: a placeholder API-key authenticator (`X-API-Key`/Bearer, backed by an `api_keys` table) and dev-only mock keys seeded for the Compose demo. Replace the authenticator with real authn — see step 6.
+- Per-client rate limiting (on by default): a Huma middleware that throttles by client IP **before** authentication and returns RFC 9457 `429` with `Retry-After`. The shipped limiter is in-process (token bucket, `golang.org/x/time/rate`) behind a `ratelimit.Limiter` port — the seam for a distributed (for example, Redis-backed) limiter. See the README's [Rate limiting](README.md#rate-limiting) section.
 - A Cobra/Viper entrypoint under `cmd/template-go-api` and `internal/cli` exposing `serve` (default), `version`, `openapi`, and `migrate`.
 - Moon tasks for `format`, `lint`, `build`, `test`, and `check`, plus `sqlc` / `sqlc-check` (regenerate and drift-guard the typed query layer), `mockery` / `mockery-check` (regenerate and drift-guard the testify mocks), `migrate` (run database migrations), and `test-integration` (container-backed adapter tests).
 - `golangci-lint`, `sqlc`, `goose`, and `mockery` wired through Proto and Moon.
diff --git a/README.md b/README.md
index dbd8c31..2e1c1fb 100644
--- a/README.md
+++ b/README.md
@@ -198,6 +198,9 @@ default.
 | `--db-max-conns` | `TEMPLATE_GO_API_DB_MAX_CONNS` | `0` | maximum PostgreSQL pool connections; `0` uses the driver default |
 | `--authz-enabled` | `TEMPLATE_GO_API_AUTHZ_ENABLED` | `true` | enable the [authorization](#authorization) middleware (deny-by-default); `false` bypasses it entirely |
 | `--authz-policy-dir` | `TEMPLATE_GO_API_AUTHZ_POLICY_DIR` | _(none)_ | directory of `.cedar` files to load instead of the embedded policies; empty uses the embedded set |
+| `--rate-limit-enabled` | `TEMPLATE_GO_API_RATE_LIMIT_ENABLED` | `true` | enable per-client [rate limiting](#rate-limiting); `false` disables throttling entirely |
+| `--rate-limit-rps` | `TEMPLATE_GO_API_RATE_LIMIT_RPS` | `10` | sustained per-client request rate (requests/second) |
+| `--rate-limit-burst` | `TEMPLATE_GO_API_RATE_LIMIT_BURST` | `20` | per-client burst size (token-bucket depth) |
 
 CORS is off until you set origins. Client IP is read from the direct TCP peer
 unless you opt into a trusted proxy header — never from `X-Forwarded-For`
@@ -440,6 +443,34 @@ policy in one slice can reference shared principal roles (`principal in
 Role::"admin"`) or another slice's entities. Cross-cutting rules and shared
 principal/role types live in the base package's `base.cedar`.
 
+## Rate limiting
+
+The API is rate limited per client out of the box (`--rate-limit-enabled`,
+default true). The limiter is a Huma middleware installed **before**
+authentication, so an over-limit request is rejected with `429 Too Many
+Requests` before it reaches the credential store — protecting the auth path and
+database from anonymous floods. The infrastructure routes (`/healthz`,
+`/readyz`, `/metrics`) bypass Huma and are never limited.
+
+Requests are keyed by **client IP** (the spoof-safe IP the
+[`--trusted-proxy-header`](#configuration) logic resolves), allowing a burst of
+`--rate-limit-burst` and a sustained `--rate-limit-rps` per second (a token
+bucket). A throttled response is RFC 9457 `application/problem+json` and carries
+a `Retry-After` header (whole seconds).
+
+The shipped limiter is **in-process** (`golang.org/x/time/rate`), with per-key
+buckets evicted after an idle period to bound memory. That is the right default
+for a single instance; behind multiple replicas a shared backend keeps the limit
+global. The `ratelimit.Limiter` port is the seam: implement `Allow` over a store
+such as Redis and wire your adapter in `internal/app/app.go` — the middleware and
+key function are unchanged. To limit authenticated callers instead of IPs, swap
+the key function (`adapterhttp.ClientIPKeyFunc`) for one that reads the principal.
+
+> The IETF [RateLimit header fields](https://datatracker.ietf.org/doc/draft-ietf-httpapi-ratelimit-headers/)
+> (`RateLimit`/`RateLimit-Policy`) are still a draft and map only loosely onto a
+> token bucket, so the template advertises the limit with the stable `Retry-After`
+> header and leaves those headers as a documented enhancement.
+
 ## Testing
 
 Unit tests sit beside the code and use [Testify](https://github.com/stretchr/testify)
diff --git a/go.mod b/go.mod
index aa59893..9e4e867 100644
--- a/go.mod
+++ b/go.mod
@@ -17,6 +17,7 @@ require (
 	github.com/stretchr/testify v1.11.1
 	github.com/testcontainers/testcontainers-go v0.43.0
 	github.com/testcontainers/testcontainers-go/modules/postgres v0.43.0
+	golang.org/x/time v0.11.0
 )
 
 require (
diff --git a/go.sum b/go.sum
index 9d2d36c..6b30b65 100644
--- a/go.sum
+++ b/go.sum
@@ -222,6 +222,8 @@ golang.org/x/term v0.44.0 h1:0rLvDRCtNj0gZkyIXhCyOb2OAzEhLVqc4B+hrsBhrmc=
 golang.org/x/term v0.44.0/go.mod h1:7ze4MdzUzLXpSAoFP1H0bOI9aXDqveSvatT5vKcFh2Y=
 golang.org/x/text v0.38.0 h1:sXmwo9DwP3OK9EZ7PqAdaooSGozfl/3a6/xJcbzPRhE=
 golang.org/x/text v0.38.0/go.mod h1:YXZt3QhHUKYT53r2lLKFIVi6Ao1jdzrTR/KQ09qyxF4=
+golang.org/x/time v0.11.0 h1:/bpjEDfN9tkoN/ryeYHnv5hcMlc8ncjMcM4XBk5NWV0=
+golang.org/x/time v0.11.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg=
 golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
 google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
 google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
diff --git a/internal/adapter/http/ratelimit.go b/internal/adapter/http/ratelimit.go
new file mode 100644
index 0000000..6b55c46
--- /dev/null
+++ b/internal/adapter/http/ratelimit.go
@@ -0,0 +1,24 @@
+package http
+
+import (
+	"github.com/danielgtaylor/huma/v2"
+	"github.com/danielgtaylor/huma/v2/adapters/humachi"
+	chimiddleware "github.com/go-chi/chi/v5/middleware"
+)
+
+// ClientIPKeyFunc keys the rate limiter by the resolved client IP. It reads the
+// IP the ClientIP middleware stored on the request context — spoof-safe, since
+// that middleware honors --trusted-proxy-header and otherwise trusts only the
+// TCP peer — so the limiter and the access log agree on who the client is.
+//
+// It has the signature of ratelimit.KeyFunc and is the default key for the
+// rate-limit middleware. To limit authenticated callers instead, swap in a key
+// function that reads the principal (see internal/authz) from the context;
+// keying lives here, in the transport, so the limiter core stays
+// router-agnostic. It never errors: an unresolved IP yields the empty key, which
+// simply shares one bucket rather than failing the request.
+func ClientIPKeyFunc(ctx huma.Context) (string, error) {
+	r, _ := humachi.Unwrap(ctx)
+
+	return chimiddleware.GetClientIP(r.Context()), nil
+}
diff --git a/internal/adapter/http/router.go b/internal/adapter/http/router.go
index 10f58d8..f27be9a 100644
--- a/internal/adapter/http/router.go
+++ b/internal/adapter/http/router.go
@@ -39,6 +39,14 @@ type RouterDeps struct {
 	Readiness []ReadinessCheck
 	// Register mounts resource operations onto the Huma API.
 	Register Registrar
+	// InstallRateLimit installs the rate-limit Huma middleware on the API. Like
+	// InstallAuthz it MUST run before the resource operations are registered (Huma
+	// snapshots the middleware stack per operation at registration), and it runs
+	// BEFORE InstallAuthz so an over-limit request is rejected before
+	// authentication touches the credential store. Nil (or a disabled middleware)
+	// leaves the API unthrottled. The infrastructure routes bypass Huma, so they
+	// are never rate limited.
+	InstallRateLimit func(huma.API)
 	// InstallAuthz installs the authentication/authorization Huma middleware on
 	// the API. It MUST run before the resource operations are registered: Huma
 	// snapshots the API's middleware stack into each operation at registration
@@ -61,12 +69,15 @@ type RouterDeps struct {
 func NewRouter(deps RouterDeps) http.Handler {
 	mux := chi.NewMux()
 
-	// Core middleware, outermost first. Deferred seams (insert here in later
-	// slices): authn/authz and rate limiting.
+	// Core chi middleware, outermost first. The rate-limit and authn/authz
+	// middleware are Huma middleware (installed on the API below), not chi
+	// middleware, so they run only for API operations and never for the
+	// infrastructure routes.
 	//
-	// Client-IP runs first so the request id, access log, and metrics all see
-	// the resolved IP. CORS sits after the access log (so preflight responses are
-	// logged and metered) and is installed only when origins are configured.
+	// Client-IP runs first so the request id, access log, metrics, and the
+	// rate limiter all see the resolved IP. CORS sits after the access log (so
+	// preflight responses are logged and metered) and is installed only when
+	// origins are configured.
 	mux.Use(middleware.ClientIP(deps.TrustedProxyHeader))
 	mux.Use(chimiddleware.RequestID)
 	mux.Use(middleware.Recoverer(deps.Logger))
@@ -92,10 +103,15 @@ func NewRouter(deps RouterDeps) http.Handler {
 	})
 
 	api := NewAPI(mux, deps.Version)
-	// The authn/authz Huma middleware is installed BEFORE the operations are
-	// registered: Huma bakes the API's middleware stack into each operation at
-	// registration time, so middleware added afterward would never run. It is a
-	// no-op when authorization is disabled.
+	// The rate-limit and authn/authz Huma middleware are installed BEFORE the
+	// operations are registered: Huma bakes the API's middleware stack into each
+	// operation at registration time, so middleware added afterward would never
+	// run. Rate limiting is installed first so it runs outermost — an over-limit
+	// request is rejected before authentication runs. Each is a no-op when its
+	// feature is disabled.
+	if deps.InstallRateLimit != nil {
+		deps.InstallRateLimit(api)
+	}
 	if deps.InstallAuthz != nil {
 		deps.InstallAuthz(api)
 	}
diff --git a/internal/app/app.go b/internal/app/app.go
index d918648..86c8de3 100644
--- a/internal/app/app.go
+++ b/internal/app/app.go
@@ -13,6 +13,7 @@ import (
 
 	"github.com/danielgtaylor/huma/v2"
 	"github.com/jackc/pgx/v5/pgxpool"
+	"golang.org/x/time/rate"
 
 	adapterhttp "github.com/meigma/template-go-api/internal/adapter/http"
 	"github.com/meigma/template-go-api/internal/adapter/postgres"
@@ -20,12 +21,17 @@ import (
 	"github.com/meigma/template-go-api/internal/authz/apikey"
 	"github.com/meigma/template-go-api/internal/config"
 	"github.com/meigma/template-go-api/internal/observability"
+	"github.com/meigma/template-go-api/internal/ratelimit"
 	"github.com/meigma/template-go-api/internal/todo"
 	todoauthz "github.com/meigma/template-go-api/internal/todo/authz"
 	"github.com/meigma/template-go-api/internal/todo/httpapi"
 	todopostgres "github.com/meigma/template-go-api/internal/todo/postgres"
 )
 
+// rateLimiterIdleTTL is how long an idle per-client bucket is kept before the
+// in-process limiter evicts it, bounding memory under churning client keys.
+const rateLimiterIdleTTL = 10 * time.Minute
+
 // App is a fully wired API server ready to Run.
 type App struct {
 	server        *http.Server
@@ -35,6 +41,9 @@ type App struct {
 	// pool is the PostgreSQL connection pool, closed during graceful shutdown.
 	// It is nil when a repository is injected with WithRepository (tests).
 	pool *pgxpool.Pool
+	// rateLimiter is the in-process rate limiter whose janitor goroutine is
+	// stopped during graceful shutdown. It is nil when rate limiting is disabled.
+	rateLimiter *ratelimit.InMemory
 }
 
 // Option configures how New wires the application.
@@ -95,6 +104,8 @@ func New(
 		return nil, err
 	}
 
+	rateLimiter, installRateLimit := buildRateLimiter(cfg, logger)
+
 	// An empty metrics-addr co-locates /metrics on the API listener; otherwise a
 	// dedicated metrics server (below) serves it off the API surface.
 	serveMetricsInline := cfg.MetricsAddr == ""
@@ -108,10 +119,11 @@ func New(
 		TrustedProxyHeader:   cfg.TrustedProxyHeader,
 		// The postgres store contributes a real connectivity check here; an
 		// injected repository (tests) contributes none, so /readyz is always ready.
-		Readiness:     readiness,
-		Register:      registerResources(service),
-		InstallAuthz:  installAuthz,
-		FinalizeAuthz: finalizeAuthz,
+		Readiness:        readiness,
+		Register:         registerResources(service),
+		InstallRateLimit: installRateLimit,
+		InstallAuthz:     installAuthz,
+		FinalizeAuthz:    finalizeAuthz,
 	})
 
 	server := &http.Server{
@@ -141,6 +153,7 @@ func New(
 		logger:        logger,
 		grace:         cfg.ShutdownGrace,
 		pool:          pool,
+		rateLimiter:   rateLimiter,
 	}, nil
 }
 
@@ -240,6 +253,25 @@ func resolveAuthenticator(
 	return apikey.NewAuthenticator(apikey.NewStore(pool)), nil
 }
 
+// buildRateLimiter constructs the rate limiter and the hook that installs the
+// rate-limit middleware on the API. When rate limiting is disabled it returns a
+// nil limiter and a nil hook, so NewRouter leaves the API unthrottled. The
+// limiter is keyed by client IP (adapterhttp.ClientIPKeyFunc); swap that key
+// function for a principal-based one to limit authenticated callers instead.
+// The returned limiter runs a janitor goroutine the App stops on shutdown.
+func buildRateLimiter(cfg config.Config, logger *slog.Logger) (*ratelimit.InMemory, func(huma.API)) {
+	if !cfg.RateLimitEnabled {
+		return nil, nil
+	}
+
+	limiter := ratelimit.NewInMemory(rate.Limit(cfg.RateLimitRPS), cfg.RateLimitBurst, rateLimiterIdleTTL)
+	install := func(api huma.API) {
+		ratelimit.NewMiddleware(api, limiter, adapterhttp.ClientIPKeyFunc, logger, true).Install()
+	}
+
+	return limiter, install
+}
+
 // Handler returns the assembled HTTP handler, primarily for functional tests.
 func (a *App) Handler() http.Handler {
 	return a.server.Handler
diff --git a/internal/app/app_test.go b/internal/app/app_test.go
index bae4876..fdafc18 100644
--- a/internal/app/app_test.go
+++ b/internal/app/app_test.go
@@ -97,3 +97,48 @@ func TestAppWiringDeniesUnauthorized(t *testing.T) {
 	application.Handler().ServeHTTP(createRec, createReq)
 	assert.Equal(t, http.StatusForbidden, createRec.Code)
 }
+
+// TestAppWiringRateLimits proves the rate-limit middleware is wired into the
+// composed server, runs before authentication, and exempts the infrastructure
+// routes. With a burst of one, the second API request from the same client is
+// throttled — and it returns 429, not the 403 a denied caller would get, which
+// shows the limiter runs before authorization. The /healthz route is never
+// limited because the infra routes bypass Huma.
+func TestAppWiringRateLimits(t *testing.T) {
+	t.Parallel()
+
+	vp := viper.New()
+	vp.Set("rate-limit-rps", 1)
+	vp.Set("rate-limit-burst", 1)
+	cfg := config.Load(vp)
+	logger := observability.NewLogger(io.Discard, slog.LevelError, "json")
+	application, err := app.New(
+		context.Background(), cfg, logger, "test",
+		app.WithRepository(todotest.NewRepository()),
+		app.WithAuthenticator(stubAuthenticator{roles: []string{"user"}}),
+	)
+	require.NoError(t, err)
+	handler := application.Handler()
+
+	// Infra routes bypass Huma, so they are never rate limited: repeated /healthz
+	// hits all succeed despite the burst of one.
+	for range 3 {
+		rec := httptest.NewRecorder()
+		handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/healthz", nil))
+		assert.Equal(t, http.StatusOK, rec.Code)
+	}
+
+	post := func() int {
+		req := httptest.NewRequest(http.MethodPost, "/v1/todos", strings.NewReader(`{"title":"x"}`))
+		req.Header.Set("Content-Type", "application/json")
+		rec := httptest.NewRecorder()
+		handler.ServeHTTP(rec, req)
+
+		return rec.Code
+	}
+
+	// The single burst token lets the first request through; the next request
+	// from the same client is throttled before authorization runs.
+	assert.Equal(t, http.StatusCreated, post())
+	assert.Equal(t, http.StatusTooManyRequests, post())
+}
diff --git a/internal/app/serve.go b/internal/app/serve.go
index 7ad016c..154094d 100644
--- a/internal/app/serve.go
+++ b/internal/app/serve.go
@@ -31,6 +31,8 @@ func (a *App) Run(ctx context.Context) error {
 	// Close the database pool (when postgres) on every exit path, after the
 	// servers have returned — including when a server fails to start.
 	defer a.closePool(ctx)
+	// Stop the in-process rate limiter's janitor goroutine on every exit path.
+	defer a.stopRateLimiter(ctx)
 
 	servers := a.servers()
 	serveErr := make(chan error, len(servers))
@@ -91,3 +93,15 @@ func (a *App) closePool(ctx context.Context) {
 	a.logger.InfoContext(ctx, "closing database pool")
 	a.pool.Close()
 }
+
+// stopRateLimiter stops the in-process rate limiter's janitor goroutine when one
+// is configured. It is deferred in Run so it executes on every exit path. It is
+// a no-op when rate limiting is disabled (no limiter was built).
+func (a *App) stopRateLimiter(ctx context.Context) {
+	if a.rateLimiter == nil {
+		return
+	}
+
+	a.logger.InfoContext(ctx, "stopping rate limiter")
+	a.rateLimiter.Stop()
+}
diff --git a/internal/config/config.go b/internal/config/config.go
index 17c3b7a..42ced2d 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -30,6 +30,18 @@ const (
 	// box. Operators set it false as an escape hatch to bypass authorization
 	// entirely (incremental adoption or local debugging).
 	defaultAuthzEnabled = true
+	// defaultRateLimitEnabled is true: the API is rate limited out of the box
+	// (per client IP, pre-auth), a secure default that also shields the
+	// credential store from anonymous floods. Operators set it false to disable
+	// throttling entirely.
+	defaultRateLimitEnabled = true
+	// defaultRateLimitRPS is the sustained per-client request rate (requests per
+	// second). It is deliberately generous so local development and the demo
+	// stack are not throttled; tune it down for production.
+	defaultRateLimitRPS = 10.0
+	// defaultRateLimitBurst is the per-client token-bucket depth: how many
+	// requests a client may make in a burst before the sustained rate applies.
+	defaultRateLimitBurst = 20
 )
 
 // Config holds runtime settings for the API server.
@@ -78,6 +90,16 @@ type Config struct {
 	// instead of the embedded set. Empty (the default) uses the embedded
 	// policies.
 	AuthzPolicyDir string
+	// RateLimitEnabled is the rate-limiting master switch. It defaults to true:
+	// the API is throttled per client IP before authentication runs. When false
+	// the rate-limit middleware is inert (pass-through).
+	RateLimitEnabled bool
+	// RateLimitRPS is the sustained per-client request rate, in requests per
+	// second, when rate limiting is enabled.
+	RateLimitRPS float64
+	// RateLimitBurst is the per-client token-bucket depth: the number of requests
+	// a client may make in a burst before the sustained RateLimitRPS applies.
+	RateLimitBurst int
 }
 
 // RegisterFlags declares the server configuration flags on flags. Binding them
@@ -115,6 +137,13 @@ func RegisterFlags(flags *pflag.FlagSet) {
 		"",
 		"directory of .cedar policy files to load instead of the embedded policies; empty uses the embedded set",
 	)
+	flags.Bool(
+		"rate-limit-enabled",
+		defaultRateLimitEnabled,
+		"enable per-client (IP) rate limiting; false disables throttling entirely",
+	)
+	flags.Float64("rate-limit-rps", defaultRateLimitRPS, "sustained per-client request rate (requests per second)")
+	flags.Int("rate-limit-burst", defaultRateLimitBurst, "per-client burst size (token-bucket depth)")
 }
 
 // Load reads the server configuration from vp, applying defaults for unset keys.
@@ -138,6 +167,9 @@ func Load(vp *viper.Viper) Config {
 		DBMaxConns:         vp.GetInt32("db-max-conns"),
 		AuthzEnabled:       vp.GetBool("authz-enabled"),
 		AuthzPolicyDir:     vp.GetString("authz-policy-dir"),
+		RateLimitEnabled:   vp.GetBool("rate-limit-enabled"),
+		RateLimitRPS:       vp.GetFloat64("rate-limit-rps"),
+		RateLimitBurst:     vp.GetInt("rate-limit-burst"),
 	}
 }
 
@@ -161,6 +193,14 @@ func (c Config) Validate() error {
 	if strings.TrimSpace(c.DatabaseURL) == "" {
 		return errors.New("database-url is required")
 	}
+	if c.RateLimitEnabled {
+		if c.RateLimitRPS <= 0 {
+			return errors.New("rate-limit-rps must be positive when rate limiting is enabled")
+		}
+		if c.RateLimitBurst <= 0 {
+			return errors.New("rate-limit-burst must be positive when rate limiting is enabled")
+		}
+	}
 
 	return nil
 }
@@ -182,4 +222,7 @@ func setDefaults(vp *viper.Viper) {
 	vp.SetDefault("db-max-conns", defaultDBMaxConns)
 	vp.SetDefault("authz-enabled", defaultAuthzEnabled)
 	vp.SetDefault("authz-policy-dir", "")
+	vp.SetDefault("rate-limit-enabled", defaultRateLimitEnabled)
+	vp.SetDefault("rate-limit-rps", defaultRateLimitRPS)
+	vp.SetDefault("rate-limit-burst", defaultRateLimitBurst)
 }
diff --git a/internal/config/config_test.go b/internal/config/config_test.go
index a5a16ca..18e3f60 100644
--- a/internal/config/config_test.go
+++ b/internal/config/config_test.go
@@ -26,6 +26,9 @@ func TestLoadDefaults(t *testing.T) {
 	assert.Zero(t, cfg.DBMaxConns)
 	assert.True(t, cfg.AuthzEnabled, "authz is enabled by default now that the routes are tagged")
 	assert.Empty(t, cfg.AuthzPolicyDir)
+	assert.True(t, cfg.RateLimitEnabled, "rate limiting is enabled by default")
+	assert.InDelta(t, defaultRateLimitRPS, cfg.RateLimitRPS, 0.0001)
+	assert.Equal(t, defaultRateLimitBurst, cfg.RateLimitBurst)
 }
 
 func TestLoadAuthzFromFlags(t *testing.T) {
@@ -105,4 +108,26 @@ func TestValidate(t *testing.T) {
 	missingDatabaseURL := base
 	missingDatabaseURL.DatabaseURL = ""
 	require.Error(t, missingDatabaseURL.Validate())
+
+	// Rate-limit settings are validated only when rate limiting is enabled.
+	rateLimited := base
+	rateLimited.RateLimitEnabled = true
+	rateLimited.RateLimitRPS = 10
+	rateLimited.RateLimitBurst = 20
+	require.NoError(t, rateLimited.Validate())
+
+	zeroRPS := rateLimited
+	zeroRPS.RateLimitRPS = 0
+	require.Error(t, zeroRPS.Validate())
+
+	zeroBurst := rateLimited
+	zeroBurst.RateLimitBurst = 0
+	require.Error(t, zeroBurst.Validate())
+
+	// With rate limiting disabled, non-positive values are ignored.
+	disabledIgnoresValues := base
+	disabledIgnoresValues.RateLimitEnabled = false
+	disabledIgnoresValues.RateLimitRPS = 0
+	disabledIgnoresValues.RateLimitBurst = 0
+	require.NoError(t, disabledIgnoresValues.Validate())
 }
diff --git a/internal/integration/authz_e2e_test.go b/internal/integration/authz_e2e_test.go
index c18a17a..9075bf4 100644
--- a/internal/integration/authz_e2e_test.go
+++ b/internal/integration/authz_e2e_test.go
@@ -44,6 +44,11 @@ func e2eServer(ctx context.Context, t *testing.T, databaseURL string) *httptest.
 
 	vp := viper.New()
 	vp.Set("database-url", databaseURL)
+	// Rate limiting is orthogonal here, and its per-IP bucket would be shared
+	// across this suite's rapid requests from one loopback client; disable it so
+	// the authz assertions stay deterministic. Rate limiting is covered by its
+	// own tests in internal/ratelimit.
+	vp.Set("rate-limit-enabled", false)
 	cfg := config.Load(vp)
 	require.NoError(t, cfg.Validate())
 	// Guard the premise of the whole suite: authz must be enabled by default now
diff --git a/internal/ratelimit/limiter.go b/internal/ratelimit/limiter.go
new file mode 100644
index 0000000..72a311a
--- /dev/null
+++ b/internal/ratelimit/limiter.go
@@ -0,0 +1,40 @@
+// Package ratelimit provides per-client request rate limiting for the API: a
+// Limiter port, an in-process token-bucket adapter, and a Huma middleware that
+// enforces the limit and emits an RFC 9457 429 response when it is exceeded.
+//
+// The Limiter interface is the extension seam. The shipped in-process adapter
+// (NewInMemory) limits per process, which is the right default for a single
+// instance; behind multiple replicas a shared backend (for example, Redis)
+// keeps the limit global. Implement Limiter with that backend and wire it in
+// the composition root — the middleware and key function are unchanged.
+package ratelimit
+
+import (
+	"context"
+	"time"
+)
+
+// Decision is the outcome of a limiter check for a single request. It carries
+// what the middleware needs to answer the client: whether the request is
+// allowed and, when it is not, how long the client should wait before retrying.
+type Decision struct {
+	// Allowed reports whether the request may proceed.
+	Allowed bool
+	// RetryAfter is how long until the client may retry. It is meaningful only
+	// when Allowed is false; the middleware rounds it up to whole seconds for the
+	// Retry-After response header. Zero means no hint is available.
+	RetryAfter time.Duration
+}
+
+// Limiter decides whether a request identified by key may proceed now. key is
+// the client identity the middleware extracts — by default the client IP.
+// Implementations must be safe for concurrent use.
+//
+// Allow returns a non-nil error only for an infrastructure failure (for
+// example, a distributed backend being unreachable). The middleware fails open
+// on such an error — it lets the request through — so that a limiter outage
+// cannot take the whole API down with it. A plain allow/deny decision returns a
+// nil error.
+type Limiter interface {
+	Allow(ctx context.Context, key string) (Decision, error)
+}
diff --git a/internal/ratelimit/memory.go b/internal/ratelimit/memory.go
new file mode 100644
index 0000000..6075b28
--- /dev/null
+++ b/internal/ratelimit/memory.go
@@ -0,0 +1,146 @@
+package ratelimit
+
+import (
+	"context"
+	"sync"
+	"time"
+
+	"golang.org/x/time/rate"
+)
+
+// minIdleTTL floors the eviction interval so a misconfigured zero or negative
+// TTL cannot panic [time.NewTicker] or spin the janitor.
+const minIdleTTL = time.Minute
+
+// InMemory is an in-process Limiter backed by a per-key token bucket
+// (golang.org/x/time/rate). Each distinct key gets its own bucket that refills
+// at rps tokens per second up to a depth of burst, so a client may burst up to
+// burst requests and then sustains rps requests per second.
+//
+// Buckets are created on first use and evicted after they go idle for idleTTL,
+// bounding memory under churning keys (for example, per-IP keys behind NAT or
+// during a scan). A background janitor performs the eviction; call Stop when the
+// limiter is no longer needed so the goroutine exits. The composition root owns
+// that lifecycle and stops the limiter on shutdown.
+type InMemory struct {
+	rps   rate.Limit
+	burst int
+	ttl   time.Duration
+
+	mu      sync.Mutex
+	clients map[string]*clientBucket
+	stop    chan struct{}
+	stopped bool
+}
+
+// clientBucket is one key's token bucket plus the last time it was seen, which
+// the janitor uses to evict idle keys.
+type clientBucket struct {
+	limiter  *rate.Limiter
+	lastSeen time.Time
+}
+
+// NewInMemory builds an in-process limiter that allows up to burst requests
+// immediately and rps requests per second sustained, independently per key.
+// Buckets unused for idleTTL are evicted. The returned limiter starts a janitor
+// goroutine; call Stop to end it.
+func NewInMemory(rps rate.Limit, burst int, idleTTL time.Duration) *InMemory {
+	if idleTTL < minIdleTTL {
+		idleTTL = minIdleTTL
+	}
+
+	l := &InMemory{
+		rps:     rps,
+		burst:   burst,
+		ttl:     idleTTL,
+		clients: make(map[string]*clientBucket),
+		stop:    make(chan struct{}),
+	}
+	go l.janitor()
+
+	return l
+}
+
+// Allow reports whether the request for key may proceed, consuming a token when
+// it does. A denied request reports how long until the next token frees up so
+// the middleware can set Retry-After. It never returns an error: an in-process
+// bucket has no backend that can fail.
+func (l *InMemory) Allow(_ context.Context, key string) (Decision, error) {
+	limiter := l.bucket(key)
+
+	reservation := limiter.Reserve()
+	if !reservation.OK() {
+		// burst is non-positive, so a token can never be granted: deny with no
+		// retry hint rather than reporting an infinite wait.
+		return Decision{Allowed: false}, nil
+	}
+
+	if delay := reservation.Delay(); delay > 0 {
+		// No token is available yet. Cancel so this check does not consume a
+		// future token, and report the wait.
+		reservation.Cancel()
+
+		return Decision{Allowed: false, RetryAfter: delay}, nil
+	}
+
+	return Decision{Allowed: true}, nil
+}
+
+// bucket returns the token bucket for key, creating it on first use, and
+// records that the key was just seen so the janitor does not evict it.
+func (l *InMemory) bucket(key string) *rate.Limiter {
+	l.mu.Lock()
+	defer l.mu.Unlock()
+
+	c, ok := l.clients[key]
+	if !ok {
+		c = &clientBucket{limiter: rate.NewLimiter(l.rps, l.burst)}
+		l.clients[key] = c
+	}
+	c.lastSeen = time.Now()
+
+	return c.limiter
+}
+
+// Stop ends the janitor goroutine. It is safe to call more than once; calls
+// after the first are no-ops. Allow still works after Stop, but idle buckets are
+// no longer evicted.
+func (l *InMemory) Stop() {
+	l.mu.Lock()
+	defer l.mu.Unlock()
+
+	if l.stopped {
+		return
+	}
+	l.stopped = true
+	close(l.stop)
+}
+
+// janitor evicts idle buckets on each tick until Stop closes the stop channel.
+func (l *InMemory) janitor() {
+	ticker := time.NewTicker(l.ttl)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-l.stop:
+			return
+		case <-ticker.C:
+			l.evictIdle()
+		}
+	}
+}
+
+// evictIdle removes every bucket not seen within the idle TTL.
+func (l *InMemory) evictIdle() {
+	cutoff := time.Now().Add(-l.ttl)
+
+	l.mu.Lock()
+	defer l.mu.Unlock()
+
+	for key, c := range l.clients {
+		if c.lastSeen.Before(cutoff) {
+			delete(l.clients, key)
+		}
+	}
+}
diff --git a/internal/ratelimit/memory_test.go b/internal/ratelimit/memory_test.go
new file mode 100644
index 0000000..de676ad
--- /dev/null
+++ b/internal/ratelimit/memory_test.go
@@ -0,0 +1,89 @@
+package ratelimit_test
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"golang.org/x/time/rate"
+
+	"github.com/meigma/template-go-api/internal/ratelimit"
+)
+
+func TestInMemoryAllowsBurstThenDenies(t *testing.T) {
+	t.Parallel()
+
+	// Refill of 1 token/sec with a burst of 3: the first three requests pass
+	// immediately, the fourth (within the same second) is denied with a hint.
+	limiter := ratelimit.NewInMemory(rate.Limit(1), 3, time.Minute)
+	t.Cleanup(limiter.Stop)
+
+	ctx := context.Background()
+	for i := range 3 {
+		d, err := limiter.Allow(ctx, "client-a")
+		require.NoError(t, err)
+		assert.Truef(t, d.Allowed, "burst request %d should be allowed", i+1)
+	}
+
+	d, err := limiter.Allow(ctx, "client-a")
+	require.NoError(t, err)
+	assert.False(t, d.Allowed, "the request past the burst should be denied")
+	assert.Positive(t, d.RetryAfter, "a denied request reports when to retry")
+}
+
+func TestInMemoryKeysAreIndependent(t *testing.T) {
+	t.Parallel()
+
+	limiter := ratelimit.NewInMemory(rate.Limit(1), 1, time.Minute)
+	t.Cleanup(limiter.Stop)
+	ctx := context.Background()
+
+	first, err := limiter.Allow(ctx, "client-a")
+	require.NoError(t, err)
+	require.True(t, first.Allowed)
+
+	// client-a is now exhausted, but client-b has its own independent bucket.
+	exhausted, err := limiter.Allow(ctx, "client-a")
+	require.NoError(t, err)
+	assert.False(t, exhausted.Allowed)
+
+	other, err := limiter.Allow(ctx, "client-b")
+	require.NoError(t, err)
+	assert.True(t, other.Allowed, "a different key has an independent bucket")
+}
+
+func TestInMemoryRefillsOverTime(t *testing.T) {
+	t.Parallel()
+
+	// 50 tokens/sec => a token refills in ~20ms. Burst 1, so the bucket empties
+	// after one request and then refills shortly after.
+	limiter := ratelimit.NewInMemory(rate.Limit(50), 1, time.Minute)
+	t.Cleanup(limiter.Stop)
+	ctx := context.Background()
+
+	first, err := limiter.Allow(ctx, "c")
+	require.NoError(t, err)
+	require.True(t, first.Allowed)
+
+	denied, err := limiter.Allow(ctx, "c")
+	require.NoError(t, err)
+	require.False(t, denied.Allowed)
+
+	// After the refill interval the bucket allows again. Reserve/Cancel on the
+	// denied polls does not consume a future token, so this converges.
+	require.Eventually(t, func() bool {
+		d, err := limiter.Allow(ctx, "c")
+
+		return err == nil && d.Allowed
+	}, time.Second, 5*time.Millisecond, "the bucket should refill and allow again")
+}
+
+func TestInMemoryStopIsIdempotent(t *testing.T) {
+	t.Parallel()
+
+	limiter := ratelimit.NewInMemory(rate.Limit(1), 1, time.Minute)
+	limiter.Stop()
+	assert.NotPanics(t, limiter.Stop, "Stop is safe to call more than once")
+}
diff --git a/internal/ratelimit/middleware.go b/internal/ratelimit/middleware.go
new file mode 100644
index 0000000..ee2ee3a
--- /dev/null
+++ b/internal/ratelimit/middleware.go
@@ -0,0 +1,111 @@
+package ratelimit
+
+import (
+	"log/slog"
+	"math"
+	"net/http"
+	"strconv"
+
+	"github.com/danielgtaylor/huma/v2"
+)
+
+// genericTooManyRequests is the client-facing detail for a throttled request.
+// The limited key is logged, not returned, so the response leaks nothing about
+// other clients' traffic.
+const genericTooManyRequests = "rate limit exceeded; retry later"
+
+// KeyFunc extracts the client identity a request is limited by — by default the
+// resolved client IP. Returning an error makes the middleware fail open (the
+// request proceeds), so a key-extraction failure cannot deny all traffic.
+type KeyFunc func(ctx huma.Context) (string, error)
+
+// Middleware enforces a per-client rate limit as Huma middleware. Installed
+// before the authentication middleware (see Install), it rejects an over-limit
+// request with an RFC 9457 429 before authentication runs, so a flood never
+// reaches the credential store. It is inert (pass-through) when disabled — the
+// escape hatch matching the authz tier.
+type Middleware struct {
+	api     huma.API
+	limiter Limiter
+	key     KeyFunc
+	logger  *slog.Logger
+	enabled bool
+}
+
+// NewMiddleware builds the rate-limit middleware over limiter, keyed by key. api
+// is the Huma API used to write the RFC 9457 problem response; logger records
+// throttling and fail-open events. When enabled is false the middleware is a
+// pass-through (Install is a no-op).
+func NewMiddleware(
+	api huma.API,
+	limiter Limiter,
+	key KeyFunc,
+	logger *slog.Logger,
+	enabled bool,
+) *Middleware {
+	if logger == nil {
+		logger = slog.Default()
+	}
+
+	return &Middleware{
+		api:     api,
+		limiter: limiter,
+		key:     key,
+		logger:  logger,
+		enabled: enabled,
+	}
+}
+
+// Install registers the rate-limit middleware on the API. Like the authz
+// middleware it must run before huma.Register (Huma snapshots the API's
+// middleware stack into each operation at registration time, so middleware
+// added afterward never runs), and it should be installed before the
+// authentication middleware so limiting happens pre-auth. It is a no-op when
+// disabled. The infrastructure routes (/healthz, /readyz, /metrics) bypass Huma,
+// so they are never rate limited.
+func (m *Middleware) Install() {
+	if !m.enabled {
+		return
+	}
+
+	m.api.UseMiddleware(m.limit)
+}
+
+// limit is the middleware function: derive the client key, consult the limiter,
+// and either continue or reject with 429. It fails open on any key-extraction or
+// limiter error so the limiter is never a single point of failure for the API.
+func (m *Middleware) limit(ctx huma.Context, next func(huma.Context)) {
+	key, err := m.key(ctx)
+	if err != nil {
+		m.logger.WarnContext(ctx.Context(), "rate-limit key extraction failed; allowing request",
+			slog.Any("error", err))
+		next(ctx)
+
+		return
+	}
+
+	decision, err := m.limiter.Allow(ctx.Context(), key)
+	if err != nil {
+		m.logger.ErrorContext(ctx.Context(), "rate limiter unavailable; allowing request",
+			slog.Any("error", err))
+		next(ctx)
+
+		return
+	}
+
+	if decision.Allowed {
+		next(ctx)
+
+		return
+	}
+
+	if decision.RetryAfter > 0 {
+		// Retry-After is whole seconds (RFC 9110); round up so the client never
+		// retries before a token is actually available.
+		seconds := int(math.Ceil(decision.RetryAfter.Seconds()))
+		ctx.SetHeader("Retry-After", strconv.Itoa(seconds))
+	}
+
+	m.logger.InfoContext(ctx.Context(), "rate limit exceeded", slog.String("key", key))
+	_ = huma.WriteErr(m.api, ctx, http.StatusTooManyRequests, genericTooManyRequests)
+}
diff --git a/internal/ratelimit/middleware_test.go b/internal/ratelimit/middleware_test.go
new file mode 100644
index 0000000..1c1dc82
--- /dev/null
+++ b/internal/ratelimit/middleware_test.go
@@ -0,0 +1,97 @@
+package ratelimit_test
+
+import (
+	"context"
+	"log/slog"
+	"net/http"
+	"testing"
+	"time"
+
+	"github.com/danielgtaylor/huma/v2"
+	"github.com/danielgtaylor/huma/v2/humatest"
+	"github.com/stretchr/testify/assert"
+	"golang.org/x/time/rate"
+
+	"github.com/meigma/template-go-api/internal/ratelimit"
+)
+
+// testClientHeader names the header the test key function reads to identify the
+// client, so a test can simulate distinct clients without distinct IPs.
+const testClientHeader = "X-Test-Client"
+
+// keyByHeader keys requests by the testClientHeader value, falling back to a
+// shared key when the header is absent.
+func keyByHeader(ctx huma.Context) (string, error) {
+	if c := ctx.Header(testClientHeader); c != "" {
+		return c, nil
+	}
+
+	return "default", nil
+}
+
+// newTestAPI installs the rate-limit middleware over limiter on a humatest API
+// and registers a single GET /ping operation behind it.
+func newTestAPI(t *testing.T, limiter ratelimit.Limiter, enabled bool) humatest.TestAPI {
+	t.Helper()
+
+	_, api := humatest.New(t)
+	logger := slog.New(slog.DiscardHandler)
+	ratelimit.NewMiddleware(api, limiter, keyByHeader, logger, enabled).Install()
+
+	huma.Register(api, huma.Operation{
+		OperationID: "ping",
+		Method:      http.MethodGet,
+		Path:        "/ping",
+	}, func(_ context.Context, _ *struct{}) (*struct{}, error) {
+		return &struct{}{}, nil
+	})
+
+	return api
+}
+
+func TestMiddlewareAllowsBurstThenReturns429(t *testing.T) {
+	t.Parallel()
+
+	limiter := ratelimit.NewInMemory(rate.Limit(1), 2, time.Minute)
+	t.Cleanup(limiter.Stop)
+	api := newTestAPI(t, limiter, true)
+
+	// The burst of 2 passes.
+	assert.Equal(t, http.StatusNoContent, api.Get("/ping").Code)
+	assert.Equal(t, http.StatusNoContent, api.Get("/ping").Code)
+
+	// The third request within the same second is throttled.
+	resp := api.Get("/ping")
+	assert.Equal(t, http.StatusTooManyRequests, resp.Code)
+	assert.NotEmpty(t, resp.Header().Get("Retry-After"), "a 429 carries a Retry-After hint")
+	assert.Contains(t, resp.Body.String(), "rate limit exceeded", "the body is the RFC 9457 problem detail")
+}
+
+func TestMiddlewareLimitsPerClientKey(t *testing.T) {
+	t.Parallel()
+
+	limiter := ratelimit.NewInMemory(rate.Limit(1), 1, time.Minute)
+	t.Cleanup(limiter.Stop)
+	api := newTestAPI(t, limiter, true)
+
+	// alice exhausts her own bucket.
+	assert.Equal(t, http.StatusNoContent, api.Get("/ping", testClientHeader+": alice").Code)
+	assert.Equal(t, http.StatusTooManyRequests, api.Get("/ping", testClientHeader+": alice").Code)
+
+	// bob is unaffected: he has an independent bucket.
+	assert.Equal(t, http.StatusNoContent, api.Get("/ping", testClientHeader+": bob").Code)
+}
+
+func TestMiddlewareDisabledIsPassthrough(t *testing.T) {
+	t.Parallel()
+
+	// A burst of 1 would throttle the second request if the middleware ran, but
+	// disabled it must never install, so every request reaches the handler.
+	limiter := ratelimit.NewInMemory(rate.Limit(1), 1, time.Minute)
+	t.Cleanup(limiter.Stop)
+	api := newTestAPI(t, limiter, false)
+
+	for range 3 {
+		assert.Equal(t, http.StatusNoContent, api.Get("/ping").Code)
+	}
+}