diff --git a/DELETE_ME.md b/DELETE_ME.md index 06de815..15eccbd 100644 --- a/DELETE_ME.md +++ b/DELETE_ME.md @@ -12,6 +12,7 @@ It is only here to orient the initial project owner. - A PostgreSQL persistence adapter (pgx + sqlc typed queries + goose migrations) behind the domain's `todo.Repository` port: a `migrate` subcommand, a committed-and-drift-guarded sqlc layer, a real `/readyz` check, and container-backed integration tests. The port is the seam — implement it to back the template with a different datastore. - An authorization tier (Cedar via `cedar-go`) with a deny-by-default Huma middleware, a modular per-resource authz slice pattern, and authentication deferred to the integrator: a placeholder API-key authenticator (`X-API-Key`/Bearer, backed by an `api_keys` table) and dev-only mock keys seeded for the Compose demo. Replace the authenticator with real authn — see step 6. - Per-client rate limiting (on by default): a Huma middleware that throttles by client IP **before** authentication and returns RFC 9457 `429` with `Retry-After`. The shipped limiter is in-process (token bucket, `golang.org/x/time/rate`) behind a `ratelimit.Limiter` port — the seam for a distributed (for example, Redis-backed) limiter. See the README's [Rate limiting](README.md#rate-limiting) section. +- OpenTelemetry distributed tracing (opt-in via `--tracing-enabled`): inbound HTTP server spans (otelhttp, named by operation) and PostgreSQL query spans (otelpgx), exported over OTLP/HTTP and configured through the standard `OTEL_*` env vars. Off by default since it needs a collector. See the README's [Tracing](README.md#tracing) section. - A Cobra/Viper entrypoint under `cmd/template-go-api` and `internal/cli` exposing `serve` (default), `version`, `openapi`, and `migrate`. 
- Moon tasks for `format`, `lint`, `build`, `test`, and `check`, plus `sqlc` / `sqlc-check` (regenerate and drift-guard the typed query layer), `mockery` / `mockery-check` (regenerate and drift-guard the testify mocks), `migrate` (run database migrations), and `test-integration` (container-backed adapter tests). - `golangci-lint`, `sqlc`, `goose`, and `mockery` wired through Proto and Moon. diff --git a/README.md b/README.md index 2e1c1fb..5159ba5 100644 --- a/README.md +++ b/README.md @@ -201,6 +201,7 @@ default. | `--rate-limit-enabled` | `TEMPLATE_GO_API_RATE_LIMIT_ENABLED` | `true` | enable per-client [rate limiting](#rate-limiting); `false` disables throttling entirely | | `--rate-limit-rps` | `TEMPLATE_GO_API_RATE_LIMIT_RPS` | `10` | sustained per-client request rate (requests/second) | | `--rate-limit-burst` | `TEMPLATE_GO_API_RATE_LIMIT_BURST` | `20` | per-client burst size (token-bucket depth) | +| `--tracing-enabled` | `TEMPLATE_GO_API_TRACING_ENABLED` | `false` | enable OpenTelemetry [tracing](#tracing); the OTLP exporter is configured via the standard `OTEL_*` env vars | CORS is off until you set origins. Client IP is read from the direct TCP peer unless you opt into a trusted proxy header — never from `X-Forwarded-For` @@ -471,6 +472,37 @@ the key function (`adapterhttp.ClientIPKeyFunc`) for one that reads the principa > token bucket, so the template advertises the limit with the stable `Retry-After` > header and leaves those headers as a documented enhancement. +## Tracing + +Distributed tracing is [OpenTelemetry](https://opentelemetry.io)-based and +**opt-in** (`--tracing-enabled`, default false) because it needs an external +collector. 
When enabled, the server exports spans over **OTLP/HTTP** and is +configured entirely through the standard `OTEL_*` environment variables — there +are no bespoke endpoint or sampler flags: + +```sh +TEMPLATE_GO_API_TRACING_ENABLED=true \ +OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 \ +OTEL_SERVICE_NAME=template-go-api \ +OTEL_TRACES_SAMPLER=parentbased_traceidratio OTEL_TRACES_SAMPLER_ARG=0.1 \ + ./bin/template-go-api serve --database-url ... +``` + +What is instrumented out of the box: + +- **Inbound HTTP** — every request is a server span (`otelhttp`) that extracts + W3C trace context for propagation. Spans are named by operation (for example + `get-todo`) for low cardinality. The infrastructure routes (`/healthz`, + `/readyz`, `/metrics`) are excluded so health checks and scrapes do not flood + the backend. +- **PostgreSQL** — each query is a child span ([`otelpgx`](https://github.com/exaring/otelpgx) + on the pool), so a trace shows the SQL under the request that issued it. + +`service.name`/`service.version` default to the app name and build version and +are overridable via `OTEL_SERVICE_NAME` / `OTEL_RESOURCE_ATTRIBUTES`. The tracer +provider is flushed on graceful shutdown. To trace your own domain logic, start +child spans with the global tracer (`otel.Tracer(...)`) inside the service layer. 
+ ## Testing Unit tests sit beside the code and use [Testify](https://github.com/stretchr/testify) diff --git a/go.mod b/go.mod index 9e4e867..6709e9f 100644 --- a/go.mod +++ b/go.mod @@ -5,6 +5,7 @@ go 1.26.4 require ( github.com/cedar-policy/cedar-go v1.8.0 github.com/danielgtaylor/huma/v2 v2.38.0 + github.com/exaring/otelpgx v0.11.1 github.com/go-chi/chi/v5 v5.3.0 github.com/go-chi/cors v1.2.2 github.com/google/uuid v1.6.0 @@ -17,6 +18,11 @@ require ( github.com/stretchr/testify v1.11.1 github.com/testcontainers/testcontainers-go v0.43.0 github.com/testcontainers/testcontainers-go/modules/postgres v0.43.0 + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.68.0 + go.opentelemetry.io/otel v1.44.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.44.0 + go.opentelemetry.io/otel/sdk v1.44.0 + go.opentelemetry.io/otel/trace v1.44.0 golang.org/x/time v0.11.0 ) @@ -26,6 +32,7 @@ require ( github.com/Microsoft/go-winio v0.6.2 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/cenkalti/backoff/v4 v4.3.0 // indirect + github.com/cenkalti/backoff/v5 v5.0.3 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/containerd/errdefs v1.0.0 // indirect github.com/containerd/errdefs/pkg v0.3.0 // indirect @@ -43,6 +50,7 @@ require ( github.com/go-logr/stdr v1.2.2 // indirect github.com/go-ole/go-ole v1.2.6 // indirect github.com/go-viper/mapstructure/v2 v2.5.0 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.29.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/jackc/pgpassfile v1.0.0 // indirect github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect @@ -82,18 +90,21 @@ require ( github.com/tklauser/numcpus v0.11.0 // indirect github.com/yusufpapurcu/wmi v1.2.4 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect - go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.68.0 // indirect - go.opentelemetry.io/otel v1.43.0 // indirect - 
go.opentelemetry.io/otel/metric v1.43.0 // indirect - go.opentelemetry.io/otel/trace v1.43.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.44.0 // indirect + go.opentelemetry.io/otel/metric v1.44.0 // indirect + go.opentelemetry.io/proto/otlp v1.10.0 // indirect go.uber.org/multierr v1.11.0 // indirect go.yaml.in/yaml/v2 v2.4.2 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect golang.org/x/crypto v0.53.0 // indirect golang.org/x/exp v0.0.0-20260410095643-746e56fc9e2f // indirect + golang.org/x/net v0.55.0 // indirect golang.org/x/sync v0.21.0 // indirect golang.org/x/sys v0.46.0 // indirect golang.org/x/text v0.38.0 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20260526163538-3dc84a4a5aaa // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20260526163538-3dc84a4a5aaa // indirect + google.golang.org/grpc v1.81.1 // indirect google.golang.org/protobuf v1.36.11 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index 6b30b65..8f7df8b 100644 --- a/go.sum +++ b/go.sum @@ -12,6 +12,8 @@ github.com/cedar-policy/cedar-go v1.8.0 h1:9gcU7EHXwHC2RMdpph68yTAkdB3behTTssC+k github.com/cedar-policy/cedar-go v1.8.0/go.mod h1:h5+3CVW1oI5LXVskJG+my9TFCYI5yjh/+Ul3EJie6MI= github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= +github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM= +github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/containerd/errdefs v1.0.0 h1:tg5yIfIlQIrxYtu9ajqY42W3lpS19XqdxRQeEwYG8PI= @@ -42,6 +44,8 @@ github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkp github.com/dustin/go-humanize 
v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= github.com/ebitengine/purego v0.10.0 h1:QIw4xfpWT6GWTzaW5XEKy3HXoqrJGx1ijYHzTF0/ISU= github.com/ebitengine/purego v0.10.0/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ= +github.com/exaring/otelpgx v0.11.1 h1:pE79fIg/qh/Lpu00kvswFC5dKfqyJJhMJ4Y4N3w5Lj4= +github.com/exaring/otelpgx v0.11.1/go.mod h1:3OojrUKhhy3lTbYIMBijP3YjMey/jo14eHAW5cXcUdk= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= @@ -63,11 +67,15 @@ github.com/go-ole/go-ole v1.2.6 h1:/Fpf6oFPoeFik9ty7siob0G6Ke8QvQEuVcuChpwXzpY= github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= github.com/go-viper/mapstructure/v2 v2.5.0 h1:vM5IJoUAy3d7zRSVtIwQgBj7BiWtMPfmPEgAXnvj1Ro= github.com/go-viper/mapstructure/v2 v2.5.0/go.mod h1:oJDH3BJKyqBA2TXFhDsKDGDTlndYOZ6rGS0BRZIxGhM= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.29.0 h1:5VipnvEpbqr2gA2VbM+nYVbkIF28c5ZQfqCBQ5g2xfk= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.29.0/go.mod h1:Hyl3n6Twe1hvtd9XUXDec4pTvgMSEixRuQKPTMH2bNs= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod 
h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM= @@ -189,16 +197,22 @@ go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.68.0 h1:CqXxU8VOmDefoh0+ztfGaymYbhdB/tT3zs79QaZTNGY= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.68.0/go.mod h1:BuhAPThV8PBHBvg8ZzZ/Ok3idOdhWIodywz2xEcRbJo= -go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I= -go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0= -go.opentelemetry.io/otel/metric v1.43.0 h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM= -go.opentelemetry.io/otel/metric v1.43.0/go.mod h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY= -go.opentelemetry.io/otel/sdk v1.43.0 h1:pi5mE86i5rTeLXqoF/hhiBtUNcrAGHLKQdhg4h4V9Dg= -go.opentelemetry.io/otel/sdk v1.43.0/go.mod h1:P+IkVU3iWukmiit/Yf9AWvpyRDlUeBaRg6Y+C58QHzg= -go.opentelemetry.io/otel/sdk/metric v1.43.0 h1:S88dyqXjJkuBNLeMcVPRFXpRw2fuwdvfCGLEo89fDkw= -go.opentelemetry.io/otel/sdk/metric v1.43.0/go.mod h1:C/RJtwSEJ5hzTiUz5pXF1kILHStzb9zFlIEe85bhj6A= -go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09nk+3A= -go.opentelemetry.io/otel/trace v1.43.0/go.mod h1:/QJhyVBUUswCphDVxq+8mld+AvhXZLhe+8WVFxiFff0= +go.opentelemetry.io/otel v1.44.0 h1:JjwHmHpA4iZ3wBxluu2fbbE7j4kqlE8jXyAyPXH7HqU= +go.opentelemetry.io/otel v1.44.0/go.mod h1:BMgjTHL9WPRlRjL2oZCBTL4whCGtXch2H4BhOPIAyYc= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.44.0 h1:4YsVu3B8+3qtWYYrsUYgn0OG78pN0rnNPRGX4SbokQI= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.44.0/go.mod h1:+wnlSn0mD1ADVMe3v9Z/WIaiz6q6gL2J/ejaAmdmv80= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.44.0 h1:lgh3PiVrRUWMLOVSkQicxzZll5NjF1r+AtsX1XRIHw0= 
+go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.44.0/go.mod h1:5Cnhth3m/AgOeTgE3ex12pPmiu/gGtZit03kSzx9X7s= +go.opentelemetry.io/otel/metric v1.44.0 h1:1w0gILTcHdr3YI+ixLyjemwrVnsMURbTZFrSYCdDdmc= +go.opentelemetry.io/otel/metric v1.44.0/go.mod h1:8O7hanEPBNgEMmybD3s2VBKcgWOCsA6tzHBPODAiquo= +go.opentelemetry.io/otel/sdk v1.44.0 h1:nHYwb9lK+fJPU/dnT6s7W7Z8itMWyqrnVfbheVYrZ58= +go.opentelemetry.io/otel/sdk v1.44.0/go.mod h1:Osuydd3Se74nqjAKxid74N5eC+jfEqfTegHRnq58oK0= +go.opentelemetry.io/otel/sdk/metric v1.44.0 h1:3LlKgI+VjbVsjNRFZJZAJ30WjXC5VkNRks6si09iEfI= +go.opentelemetry.io/otel/sdk/metric v1.44.0/go.mod h1:5B5pMARnXxKhltooO4xUuCBorl65a4EpnTalObqOigA= +go.opentelemetry.io/otel/trace v1.44.0 h1:jxF5CsGYCe74MCRx2X4g7WsY/VBKRqqpNvXlX/6gtIk= +go.opentelemetry.io/otel/trace v1.44.0/go.mod h1:oLl1jrMQAVo6v3GAggN+1VH9VIz9iUSvW53sW1Q8PIE= +go.opentelemetry.io/proto/otlp v1.10.0 h1:IQRWgT5srOCYfiWnpqUYz9CVmbO8bFmKcwYxpuCSL2g= +go.opentelemetry.io/proto/otlp v1.10.0/go.mod h1:/CV4QoCR/S9yaPj8utp3lvQPoqMtxXdzn7ozvvozVqk= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= @@ -211,6 +225,8 @@ golang.org/x/crypto v0.53.0 h1:QZ4Muo8THX6CizN2vPPd5fBGHyogrdK9fG4wLPFUsto= golang.org/x/crypto v0.53.0/go.mod h1:DNLU434OwVakk9PzuwV8w62mAJpRJL3vsgcfp4Qnsio= golang.org/x/exp v0.0.0-20260410095643-746e56fc9e2f h1:W3F4c+6OLc6H2lb//N1q4WpJkhzJCK5J6kUi1NTVXfM= golang.org/x/exp v0.0.0-20260410095643-746e56fc9e2f/go.mod h1:J1xhfL/vlindoeF/aINzNzt2Bket5bjo9sdOYzOsU80= +golang.org/x/net v0.55.0 h1:bcvxaJn3e1U6InsFWt1JUq1aSjnRxLzT2rtD2KfkDF8= +golang.org/x/net v0.55.0/go.mod h1:L5U2KuzuOe1lY7Z+aWVIKK6qEeJXnXV9yzGA+WCHJww= golang.org/x/sync v0.21.0 h1:HLII4xRRTtCRkxYp4HNFF0Js/Og6q2i++KXbg0gHCwM= golang.org/x/sync v0.21.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= golang.org/x/sys 
v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -225,6 +241,14 @@ golang.org/x/text v0.38.0/go.mod h1:YXZt3QhHUKYT53r2lLKFIVi6Ao1jdzrTR/KQ09qyxF4= golang.org/x/time v0.11.0 h1:/bpjEDfN9tkoN/ryeYHnv5hcMlc8ncjMcM4XBk5NWV0= golang.org/x/time v0.11.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +gonum.org/v1/gonum v0.17.0 h1:VbpOemQlsSMrYmn7T2OUvQ4dqxQXU+ouZFQsZOx50z4= +gonum.org/v1/gonum v0.17.0/go.mod h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E= +google.golang.org/genproto/googleapis/api v0.0.0-20260526163538-3dc84a4a5aaa h1:Kjn0N0tCrDgiAFW+lGO4JZ3ck44CehvJQMAwj9QF0G8= +google.golang.org/genproto/googleapis/api v0.0.0-20260526163538-3dc84a4a5aaa/go.mod h1:q4lMZS6kskjT5HvCPrnnypcDPVJqT/f4nfxmkE7gryY= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260526163538-3dc84a4a5aaa h1:mZHHdPZl0dbGHCflZgAq/Q468DWVFcU2whhB2KAo8fk= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260526163538-3dc84a4a5aaa/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8= +google.golang.org/grpc v1.81.1 h1:VnnIIZ88UzOOKLukQi+ImGz8O1Wdp8nAGGnvOfEIWQQ= +google.golang.org/grpc v1.81.1/go.mod h1:xGH9GfzOyMTGIOXBJmXt+BX/V0kcdQbdcuwQ/zNw42I= google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= diff --git a/internal/adapter/http/router.go b/internal/adapter/http/router.go index f27be9a..2e00676 100644 --- a/internal/adapter/http/router.go +++ b/internal/adapter/http/router.go @@ -9,12 +9,21 @@ import ( "github.com/danielgtaylor/huma/v2" "github.com/go-chi/chi/v5" chimiddleware "github.com/go-chi/chi/v5/middleware" + "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" 
"github.com/meigma/template-go-api/internal/adapter/http/middleware" "github.com/meigma/template-go-api/internal/adapter/http/problem" "github.com/meigma/template-go-api/internal/observability" ) +// Infrastructure route paths. They are raw chi routes outside the Huma API and +// the OpenAPI spec, and are excluded from tracing. +const ( + pathHealthz = "/healthz" + pathReadyz = "/readyz" + pathMetrics = "/metrics" +) + // RouterDeps carries the dependencies needed to assemble the HTTP handler. type RouterDeps struct { // Logger is the base logger for the recover and access-log middleware. @@ -39,6 +48,12 @@ type RouterDeps struct { Readiness []ReadinessCheck // Register mounts resource operations onto the Huma API. Register Registrar + // Tracing wraps the handler with the OpenTelemetry HTTP server-span + // instrumentation (otelhttp) and installs the span-naming Huma middleware. + // The infrastructure routes (/healthz, /readyz, /metrics) are filtered out so + // health checks and metrics scrapes do not generate spans. False adds no + // tracing overhead. + Tracing bool // InstallRateLimit installs the rate-limit Huma middleware on the API. Like // InstallAuthz it MUST run before the resource operations are registered (Huma // snapshots the middleware stack per operation at registration), and it runs @@ -103,12 +118,16 @@ func NewRouter(deps RouterDeps) http.Handler { }) api := NewAPI(mux, deps.Version) - // The rate-limit and authn/authz Huma middleware are installed BEFORE the - // operations are registered: Huma bakes the API's middleware stack into each - // operation at registration time, so middleware added afterward would never - // run. Rate limiting is installed first so it runs outermost — an over-limit - // request is rejected before authentication runs. Each is a no-op when its - // feature is disabled. 
+ // The tracing, rate-limit, and authn/authz Huma middleware are installed + // BEFORE the operations are registered: Huma bakes the API's middleware stack + // into each operation at registration time, so middleware added afterward + // would never run. The span namer is installed first so it runs outermost + // and names the span even for requests rejected later; rate limiting next so + // an over-limit request is rejected before authentication runs. Each is a + // no-op when its feature is disabled. + if deps.Tracing { + api.UseMiddleware(observability.TraceSpanNamer) + } if deps.InstallRateLimit != nil { deps.InstallRateLimit(api) } @@ -129,19 +148,38 @@ func NewRouter(deps RouterDeps) http.Handler { // Infrastructure routes stay raw chi and are excluded from the spec. mountInfra(mux, deps.Metrics, deps.Readiness, deps.ServeMetricsEndpoint) + if deps.Tracing { + // Wrap the whole handler in the OpenTelemetry HTTP server span, extracting + // any propagated trace context. The filter excludes the infrastructure + // routes so health checks and metrics scrapes are not traced. + return otelhttp.NewHandler(mux, "http.server", otelhttp.WithFilter(traceableRequest)) + } + return mux } +// traceableRequest reports whether a request should be traced. The +// infrastructure routes (/healthz, /readyz, /metrics) are excluded so routine +// health checks and metrics scrapes do not flood the trace backend. 
+func traceableRequest(r *http.Request) bool { + switch r.URL.Path { + case pathHealthz, pathReadyz, pathMetrics: + return false + default: + return true + } +} + func mountInfra( mux chi.Router, metrics *observability.Metrics, readiness []ReadinessCheck, serveMetrics bool, ) { - mux.Get("/healthz", handleHealthz) - mux.Get("/readyz", handleReadyz(readiness)) + mux.Get(pathHealthz, handleHealthz) + mux.Get(pathReadyz, handleReadyz(readiness)) if serveMetrics { - mux.Handle("/metrics", metrics.Handler()) + mux.Handle(pathMetrics, metrics.Handler()) } } diff --git a/internal/adapter/http/router_test.go b/internal/adapter/http/router_test.go index 6765786..6d6896c 100644 --- a/internal/adapter/http/router_test.go +++ b/internal/adapter/http/router_test.go @@ -76,6 +76,48 @@ func TestMetricsEndpointOmittedWhenServedSeparately(t *testing.T) { assert.Equal(t, http.StatusOK, get(t, srv, "/healthz").status) } +// TestRouterWithTracingServesRequests verifies that enabling tracing wraps the +// handler with the otelhttp server-span instrumentation without breaking +// routing: infra routes still serve normally. +func TestRouterWithTracingServesRequests(t *testing.T) { + t.Parallel() + + discard := observability.NewLogger(io.Discard, slog.LevelError, "json") + handler := NewRouter(RouterDeps{ + Logger: discard, + Metrics: observability.NewMetrics(), + ServeMetricsEndpoint: true, + Version: "test", + RequestTimeout: testRequestTimeout, + Tracing: true, + Register: nil, + }) + + srv := httptest.NewServer(handler) + t.Cleanup(srv.Close) + + assert.Equal(t, http.StatusOK, get(t, srv, "/healthz").status) + assert.Equal(t, http.StatusOK, get(t, srv, "/metrics").status) +} + +// TestTraceableRequest checks that the infrastructure routes are excluded from +// tracing while resource routes are traced. 
+func TestTraceableRequest(t *testing.T) { + t.Parallel() + + cases := map[string]bool{ + "/healthz": false, + "/readyz": false, + "/metrics": false, + "/v1/todos": true, + "/v1/todos/42": true, + } + for path, want := range cases { + req := httptest.NewRequest(http.MethodGet, path, nil) + assert.Equalf(t, want, traceableRequest(req), "traceableRequest(%q)", path) + } +} + // TestNewMetricsHandler verifies the dedicated metrics handler serves /metrics // and nothing else. func TestNewMetricsHandler(t *testing.T) { diff --git a/internal/adapter/postgres/postgres.go b/internal/adapter/postgres/postgres.go index dde1d2d..0856013 100644 --- a/internal/adapter/postgres/postgres.go +++ b/internal/adapter/postgres/postgres.go @@ -6,6 +6,7 @@ import ( "context" "fmt" + "github.com/exaring/otelpgx" "github.com/jackc/pgx/v5/pgxpool" ) @@ -16,6 +17,10 @@ type Config struct { URL string // MaxConns caps the pool size. Zero leaves pgx's default in place. MaxConns int32 + // Tracing installs the OpenTelemetry pgx query tracer on the pool so each + // query becomes a child span of the request. It uses the global tracer + // provider; leave it false to add no tracing overhead. + Tracing bool } // Connect parses cfg, applies pool tuning, and opens a connection pool, @@ -29,6 +34,12 @@ func Connect(ctx context.Context, cfg Config) (*pgxpool.Pool, error) { if cfg.MaxConns > 0 { poolCfg.MaxConns = cfg.MaxConns } + if cfg.Tracing { + // Trace every query as a span under the active request span. The tracer + // resolves the global tracer provider at query time, so it is inert until + // a real provider is installed. + poolCfg.ConnConfig.Tracer = otelpgx.NewTracer() + } pool, err := pgxpool.NewWithConfig(ctx, poolCfg) if err != nil { diff --git a/internal/app/app.go b/internal/app/app.go index 86c8de3..c72ce35 100644 --- a/internal/app/app.go +++ b/internal/app/app.go @@ -32,6 +32,10 @@ import ( // in-process limiter evicts it, bounding memory under churning client keys. 
const rateLimiterIdleTTL = 10 * time.Minute +// serviceName is the OpenTelemetry service.name reported by traces. It is a +// default; OTEL_SERVICE_NAME or OTEL_RESOURCE_ATTRIBUTES override it. +const serviceName = "template-go-api" + // App is a fully wired API server ready to Run. type App struct { server *http.Server @@ -44,6 +48,9 @@ type App struct { // rateLimiter is the in-process rate limiter whose janitor goroutine is // stopped during graceful shutdown. It is nil when rate limiting is disabled. rateLimiter *ratelimit.InMemory + // traceShutdown flushes and shuts down the OpenTelemetry tracer provider on + // graceful shutdown. It is nil when tracing is disabled. + traceShutdown func(context.Context) error } // Option configures how New wires the application. @@ -106,6 +113,17 @@ func New( rateLimiter, installRateLimit := buildRateLimiter(cfg, logger) + // Configure tracing before serving so the global provider is in place when + // requests (and their pgx queries) start producing spans. + traceShutdown, err := observability.NewTracerProvider(ctx, observability.TracingConfig{ + Enabled: cfg.TracingEnabled, + ServiceName: serviceName, + ServiceVersion: version, + }) + if err != nil { + return nil, fmt.Errorf("init tracing: %w", err) + } + // An empty metrics-addr co-locates /metrics on the API listener; otherwise a // dedicated metrics server (below) serves it off the API surface. serveMetricsInline := cfg.MetricsAddr == "" @@ -121,6 +139,7 @@ func New( // injected repository (tests) contributes none, so /readyz is always ready. 
Readiness: readiness, Register: registerResources(service), + Tracing: cfg.TracingEnabled, InstallRateLimit: installRateLimit, InstallAuthz: installAuthz, FinalizeAuthz: finalizeAuthz, @@ -154,6 +173,7 @@ func New( grace: cfg.ShutdownGrace, pool: pool, rateLimiter: rateLimiter, + traceShutdown: traceShutdown, }, nil } @@ -170,7 +190,11 @@ func resolveStore( return injected, nil, nil, nil } - pool, err := postgres.Connect(ctx, postgres.Config{URL: cfg.DatabaseURL, MaxConns: cfg.DBMaxConns}) + pool, err := postgres.Connect(ctx, postgres.Config{ + URL: cfg.DatabaseURL, + MaxConns: cfg.DBMaxConns, + Tracing: cfg.TracingEnabled, + }) if err != nil { return nil, nil, nil, fmt.Errorf("connect postgres: %w", err) } diff --git a/internal/app/serve.go b/internal/app/serve.go index 154094d..595136b 100644 --- a/internal/app/serve.go +++ b/internal/app/serve.go @@ -33,6 +33,8 @@ func (a *App) Run(ctx context.Context) error { defer a.closePool(ctx) // Stop the in-process rate limiter's janitor goroutine on every exit path. defer a.stopRateLimiter(ctx) + // Flush and shut down the tracer provider on every exit path. + defer a.shutdownTracing(ctx) servers := a.servers() serveErr := make(chan error, len(servers)) @@ -105,3 +107,23 @@ func (a *App) stopRateLimiter(ctx context.Context) { a.logger.InfoContext(ctx, "stopping rate limiter") a.rateLimiter.Stop() } + +// shutdownTracing flushes and shuts down the tracer provider when tracing is +// enabled. It is deferred in Run so it executes on every exit path. The flush +// runs on a fresh, grace-bounded context because Run's context is already +// cancelled by the time shutdown begins, and an already-cancelled context would +// abandon the final span export. It is a no-op when tracing is disabled. 
+func (a *App) shutdownTracing(ctx context.Context) { + if a.traceShutdown == nil { + return + } + + a.logger.InfoContext(ctx, "shutting down tracer provider") + + flushCtx, cancel := context.WithTimeout(context.Background(), a.grace) + defer cancel() + + if err := a.traceShutdown(flushCtx); err != nil { + a.logger.ErrorContext(ctx, "tracer provider shutdown failed", slog.Any("error", err)) + } +} diff --git a/internal/config/config.go b/internal/config/config.go index 42ced2d..14f5215 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -42,6 +42,10 @@ const ( // defaultRateLimitBurst is the per-client token-bucket depth: how many // requests a client may make in a burst before the sustained rate applies. defaultRateLimitBurst = 20 + // defaultTracingEnabled is false: distributed tracing requires an external + // OpenTelemetry collector, so it is opt-in. Enable it and configure the + // exporter via the standard OTEL_* environment variables. + defaultTracingEnabled = false ) // Config holds runtime settings for the API server. @@ -100,6 +104,10 @@ type Config struct { // RateLimitBurst is the per-client token-bucket depth: the number of requests // a client may make in a burst before the sustained RateLimitRPS applies. RateLimitBurst int + // TracingEnabled turns on OpenTelemetry distributed tracing. It defaults to + // false because tracing needs an external collector; the exporter is then + // configured via the standard OTEL_* environment variables. + TracingEnabled bool } // RegisterFlags declares the server configuration flags on flags. 
Binding them @@ -144,6 +152,11 @@ func RegisterFlags(flags *pflag.FlagSet) { ) flags.Float64("rate-limit-rps", defaultRateLimitRPS, "sustained per-client request rate (requests per second)") flags.Int("rate-limit-burst", defaultRateLimitBurst, "per-client burst size (token-bucket depth)") + flags.Bool( + "tracing-enabled", + defaultTracingEnabled, + "enable OpenTelemetry tracing (OTLP); configure the exporter via the standard OTEL_* env vars", + ) } // Load reads the server configuration from vp, applying defaults for unset keys. @@ -170,6 +183,7 @@ func Load(vp *viper.Viper) Config { RateLimitEnabled: vp.GetBool("rate-limit-enabled"), RateLimitRPS: vp.GetFloat64("rate-limit-rps"), RateLimitBurst: vp.GetInt("rate-limit-burst"), + TracingEnabled: vp.GetBool("tracing-enabled"), } } @@ -225,4 +239,5 @@ func setDefaults(vp *viper.Viper) { vp.SetDefault("rate-limit-enabled", defaultRateLimitEnabled) vp.SetDefault("rate-limit-rps", defaultRateLimitRPS) vp.SetDefault("rate-limit-burst", defaultRateLimitBurst) + vp.SetDefault("tracing-enabled", defaultTracingEnabled) } diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 18e3f60..0855fa7 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -29,6 +29,7 @@ func TestLoadDefaults(t *testing.T) { assert.True(t, cfg.RateLimitEnabled, "rate limiting is enabled by default") assert.InDelta(t, defaultRateLimitRPS, cfg.RateLimitRPS, 0.0001) assert.Equal(t, defaultRateLimitBurst, cfg.RateLimitBurst) + assert.False(t, cfg.TracingEnabled, "tracing is opt-in (needs an external collector)") } func TestLoadAuthzFromFlags(t *testing.T) { diff --git a/internal/observability/tracing.go b/internal/observability/tracing.go new file mode 100644 index 0000000..6aedd44 --- /dev/null +++ b/internal/observability/tracing.go @@ -0,0 +1,94 @@ +package observability + +import ( + "context" + "fmt" + + "github.com/danielgtaylor/huma/v2" + "go.opentelemetry.io/otel" + 
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp" + "go.opentelemetry.io/otel/propagation" + "go.opentelemetry.io/otel/sdk/resource" + sdktrace "go.opentelemetry.io/otel/sdk/trace" + semconv "go.opentelemetry.io/otel/semconv/v1.39.0" + "go.opentelemetry.io/otel/trace" +) + +// TracingConfig configures the OpenTelemetry tracer provider. +type TracingConfig struct { + // Enabled is the master switch. When false, NewTracerProvider installs + // nothing and returns a nil shutdown. + Enabled bool + // ServiceName and ServiceVersion seed the resource's service.name and + // service.version. They are defaults: the standard OTEL_SERVICE_NAME and + // OTEL_RESOURCE_ATTRIBUTES environment variables override them. + ServiceName string + ServiceVersion string +} + +// NewTracerProvider configures OpenTelemetry tracing and registers it globally, +// returning a shutdown function that flushes buffered spans. When cfg.Enabled is +// false it installs nothing — the global no-op tracer provider stays in place — +// and returns a nil shutdown, so instrumentation such as otelhttp and otelpgx +// adds no overhead. Callers must nil-check the returned shutdown. +// +// The exporter is OTLP/HTTP, configured entirely from the standard OTEL_* +// environment variables (OTEL_EXPORTER_OTLP_ENDPOINT, OTEL_EXPORTER_OTLP_HEADERS, +// OTEL_TRACES_SAMPLER, and so on), so deployments tune tracing the OpenTelemetry +// way rather than through bespoke flags. The global propagator is set to W3C +// Trace Context + Baggage so trace context flows across services. +func NewTracerProvider(ctx context.Context, cfg TracingConfig) (func(context.Context) error, error) { + if !cfg.Enabled { + return nil, nil //nolint:nilnil // tracing disabled: there is no shutdown to run and no error. 
+ } + + exporter, err := otlptracehttp.New(ctx) + if err != nil { + return nil, fmt.Errorf("create otlp trace exporter: %w", err) + } + + // Only WithTelemetrySDK carries a schema URL; WithAttributes and WithFromEnv + // are schemaless, so the merge cannot conflict. WithFromEnv is last so + // OTEL_SERVICE_NAME / OTEL_RESOURCE_ATTRIBUTES override the seeded defaults. + res, err := resource.New(ctx, + resource.WithTelemetrySDK(), + resource.WithAttributes( + semconv.ServiceName(cfg.ServiceName), + semconv.ServiceVersion(cfg.ServiceVersion), + ), + resource.WithFromEnv(), + ) + if err != nil { + return nil, fmt.Errorf("build trace resource: %w", err) + } + + provider := sdktrace.NewTracerProvider( + sdktrace.WithBatcher(exporter), + sdktrace.WithResource(res), + ) + + otel.SetTracerProvider(provider) + otel.SetTextMapPropagator(propagation.NewCompositeTextMapPropagator( + propagation.TraceContext{}, + propagation.Baggage{}, + )) + + return provider.Shutdown, nil +} + +// TraceSpanNamer is router-agnostic Huma middleware that renames the active +// server span (created by otelhttp at the edge) to the matched operation's ID +// and tags it with the route template, giving low-cardinality, meaningful span +// names like "get-todo" instead of bare HTTP methods or high-cardinality paths. +// It is a no-op when no span is recording (tracing disabled), so it is safe to +// install unconditionally — though the composition root installs it only when +// tracing is enabled. 
+func TraceSpanNamer(ctx huma.Context, next func(huma.Context)) {
+	if active := trace.SpanFromContext(ctx.Context()); active.IsRecording() {
+		matched := ctx.Operation()
+		active.SetName(matched.OperationID)
+		active.SetAttributes(semconv.HTTPRoute(matched.Path))
+	}
+
+	next(ctx) // the chain continues whether or not a span was renamed
+}
diff --git a/internal/observability/tracing_test.go b/internal/observability/tracing_test.go
new file mode 100644
index 0000000..8a7b298
--- /dev/null
+++ b/internal/observability/tracing_test.go
@@ -0,0 +1,93 @@
+package observability_test
+
+import (
+	"context"
+	"net/http"
+	"testing"
+
+	"github.com/danielgtaylor/huma/v2"
+	"github.com/danielgtaylor/huma/v2/humatest"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"go.opentelemetry.io/otel"
+	sdktrace "go.opentelemetry.io/otel/sdk/trace"
+	"go.opentelemetry.io/otel/sdk/trace/tracetest"
+
+	"github.com/meigma/template-go-api/internal/observability"
+)
+
+// TestNewTracerProviderDisabled checks that a disabled config yields a nil
+// shutdown and no error.
+func TestNewTracerProviderDisabled(t *testing.T) {
+	t.Parallel()
+
+	cfg := observability.TracingConfig{Enabled: false}
+
+	stop, err := observability.NewTracerProvider(context.Background(), cfg)
+	require.NoError(t, err)
+	assert.Nil(t, stop, "a disabled provider installs nothing and has no shutdown")
+}
+
+// TestNewTracerProviderEnabled checks that an enabled config yields a shutdown
+// function that itself returns no error.
+func TestNewTracerProviderEnabled(t *testing.T) {
+	// Deliberately serial: the call under test replaces the global tracer
+	// provider and propagator, so both are captured and put back on cleanup.
+	origProvider := otel.GetTracerProvider()
+	origPropagator := otel.GetTextMapPropagator()
+	t.Cleanup(func() {
+		otel.SetTracerProvider(origProvider)
+		otel.SetTextMapPropagator(origPropagator)
+	})
+
+	cfg := observability.TracingConfig{
+		Enabled:        true,
+		ServiceName:    "test-service",
+		ServiceVersion: "1.2.3",
+	}
+
+	stop, err := observability.NewTracerProvider(context.Background(), cfg)
+	require.NoError(t, err)
+	require.NotNil(t, stop)
+
+	// No collector is required: the OTLP/HTTP exporter connects lazily, and
+	// shutting down only flushes an empty batch.
+	assert.NoError(t, stop(context.Background()))
+}
+
+// TestTraceSpanNamerRenamesSpanToOperationID checks that the middleware
+// renames a recording span on the request context to the operation ID.
+func TestTraceSpanNamerRenamesSpanToOperationID(t *testing.T) {
+	t.Parallel()
+
+	recorded := tracetest.NewInMemoryExporter()
+	tp := sdktrace.NewTracerProvider(sdktrace.WithSyncer(recorded))
+	t.Cleanup(func() { _ = tp.Shutdown(context.Background()) })
+
+	// Stand-in for the otelhttp server span: open a recording span and attach
+	// it to the request context ahead of the namer.
+	startServerSpan := func(ctx huma.Context, next func(huma.Context)) {
+		spanCtx, span := tp.Tracer("test").Start(ctx.Context(), "HTTP")
+		defer span.End()
+		next(huma.WithContext(ctx, spanCtx))
+	}
+
+	_, api := humatest.New(t)
+	api.UseMiddleware(startServerSpan)
+	api.UseMiddleware(observability.TraceSpanNamer)
+
+	getThing := huma.Operation{
+		OperationID: "get-thing",
+		Method:      http.MethodGet,
+		Path:        "/things/{id}",
+	}
+	huma.Register(api, getThing, func(_ context.Context, _ *struct{}) (*struct{}, error) {
+		return &struct{}{}, nil
+	})
+
+	api.Get("/things/42")
+
+	got := recorded.GetSpans()
+	require.Len(t, got, 1)
+	assert.Equal(t, "get-thing", got[0].Name, "the active span is renamed to the operation id")
+}