diff --git a/go.mod b/go.mod
index e4bfbd8..2eeda9f 100644
--- a/go.mod
+++ b/go.mod
@@ -28,8 +28,10 @@ require (
 	github.com/testcontainers/testcontainers-go v0.40.0
 	github.com/testcontainers/testcontainers-go/modules/localstack v0.40.0
 	github.com/testcontainers/testcontainers-go/modules/mysql v0.40.0
+	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0
 	go.opentelemetry.io/otel v1.43.0
 	go.opentelemetry.io/otel/exporters/prometheus v0.65.0
+	go.opentelemetry.io/otel/metric v1.43.0
 	go.opentelemetry.io/otel/sdk v1.43.0
 	go.opentelemetry.io/otel/sdk/metric v1.43.0
 	golang.org/x/sync v0.20.0
@@ -223,9 +225,7 @@ require (
 	go.opentelemetry.io/collector/internal/telemetry v0.136.0 // indirect
 	go.opentelemetry.io/collector/pdata v1.42.0 // indirect
 	go.opentelemetry.io/contrib/bridges/otelzap v0.13.0 // indirect
-	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 // indirect
 	go.opentelemetry.io/otel/log v0.14.0 // indirect
-	go.opentelemetry.io/otel/metric v1.43.0 // indirect
 	go.opentelemetry.io/otel/trace v1.43.0 // indirect
 	go.uber.org/atomic v1.11.0 // indirect
 	go.uber.org/multierr v1.11.0 // indirect
diff --git a/pkg/api/telemetry_integration_test.go b/pkg/api/telemetry_integration_test.go
new file mode 100644
index 0000000..abf9be6
--- /dev/null
+++ b/pkg/api/telemetry_integration_test.go
@@ -0,0 +1,130 @@
+//go:build integration
+
+package api
+
+import (
+	"database/sql"
+	"io"
+	"log/slog"
+	"net/http"
+	"net/http/httptest"
+	"os"
+	"testing"
+
+	"github.com/block/spirit/pkg/utils"
+	_ "github.com/go-sql-driver/mysql"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"github.com/testcontainers/testcontainers-go"
+	"github.com/testcontainers/testcontainers-go/modules/mysql"
+	"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
+
+	"github.com/block/schemabot/pkg/storage/mysqlstore"
+	"github.com/block/schemabot/pkg/testutil"
+)
+
+// TestMetricsAfterRequests starts a real service with MySQL storage, hits
+// several API endpoints, then scrapes /metrics and verifies that HTTP server
+// metrics appear in the Prometheus text output.
+func TestMetricsAfterRequests(t *testing.T) {
+	ctx := t.Context()
+
+	container, err := mysql.Run(ctx,
+		"mysql:8.4",
+		mysql.WithDatabase("schemabot_test"),
+		mysql.WithUsername("root"),
+		mysql.WithPassword("test"),
+	)
+	require.NoError(t, err, "failed to start mysql")
+	t.Cleanup(func() {
+		if err := testcontainers.TerminateContainer(container); err != nil {
+			t.Logf("failed to terminate container: %v", err)
+		}
+	})
+
+	dsn, err := testutil.ContainerConnectionString(ctx, container, "parseTime=true")
+	require.NoError(t, err, "failed to get connection string")
+
+	logger := slog.New(slog.NewTextHandler(os.Stdout, &slog.HandlerOptions{Level: slog.LevelError}))
+	require.NoError(t, EnsureSchema(dsn, logger), "failed to ensure schema")
+
+	db, err := sql.Open("mysql", dsn)
+	require.NoError(t, err)
+	// sql.DB.Close is idempotent, so this stays safe even if closing the
+	// service also closes the handle it was given.
+	t.Cleanup(func() { _ = db.Close() })
+	require.NoError(t, db.PingContext(ctx))
+
+	storage := mysqlstore.New(db)
+	serverConfig := &ServerConfig{
+		TernDeployments: TernConfig{
+			"default": {"staging": "tern-staging:9090"},
+		},
+	}
+	svc := New(storage, serverConfig, nil, logger)
+	defer utils.CloseAndLog(svc)
+
+	// Set up telemetry and routes exactly as serve.go does.
+	tel, err := SetupTelemetry(logger)
+	require.NoError(t, err)
+	// Shut down via defer rather than t.Cleanup: t.Context() is canceled
+	// before cleanup functions run, so a cleanup would hand Shutdown an
+	// already-canceled context.
+	defer func() { assert.NoError(t, tel.Shutdown(ctx)) }()
+
+	mux := http.NewServeMux()
+	svc.ConfigureRoutes(mux)
+	mux.Handle("GET /metrics", tel.MetricsHandler)
+	handler := otelhttp.NewHandler(mux, "schemabot")
+
+	ts := httptest.NewServer(handler)
+	defer ts.Close()
+
+	// Hit several endpoints to generate HTTP metrics.
+	endpoints := []struct {
+		method string
+		path   string
+	}{
+		{http.MethodGet, "/health"},
+		{http.MethodGet, "/api/status"},
+		{http.MethodGet, "/api/locks"},
+		{http.MethodGet, "/api/settings"},
+		{http.MethodGet, "/api/logs"},
+	}
+
+	client := ts.Client()
+	for _, ep := range endpoints {
+		req, err := http.NewRequestWithContext(ctx, ep.method, ts.URL+ep.path, nil)
+		require.NoError(t, err)
+		resp, err := client.Do(req)
+		require.NoError(t, err)
+		// Drain before closing so the keep-alive connection can be reused.
+		_, _ = io.Copy(io.Discard, resp.Body)
+		require.NoError(t, resp.Body.Close())
+	}
+
+	// Scrape /metrics and verify HTTP server metrics appear.
+	metricsReq, err := http.NewRequestWithContext(ctx, http.MethodGet, ts.URL+"/metrics", nil)
+	require.NoError(t, err)
+	resp, err := client.Do(metricsReq)
+	require.NoError(t, err)
+	defer resp.Body.Close()
+
+	assert.Equal(t, http.StatusOK, resp.StatusCode)
+
+	body, err := io.ReadAll(resp.Body)
+	require.NoError(t, err)
+	metricsText := string(body)
+
+	// otelhttp produces these standard metrics.
+	for _, name := range []string{
+		"http_server_request_duration",
+		"http_server_request_body_size",
+		"http_server_response_body_size",
+	} {
+		assert.Contains(t, metricsText, name, "/metrics should contain %s", name)
+	}
+
+	// The custom plans counter only appears after its first increment,
+	// so we don't assert it here — it's tested in TestRecordPlanMetric.
+}
diff --git a/pkg/api/telemetry_test.go b/pkg/api/telemetry_test.go
index ee5b6d0..e697ed1 100644
--- a/pkg/api/telemetry_test.go
+++ b/pkg/api/telemetry_test.go
@@ -9,6 +9,7 @@ import (
 
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
+	"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
 	"go.opentelemetry.io/otel"
 	"go.opentelemetry.io/otel/attribute"
 	sdkmetric "go.opentelemetry.io/otel/sdk/metric"
@@ -103,3 +104,65 @@ func TestRecordPlanMetric(t *testing.T) {
 	}
 	metricdatatest.AssertEqual(t, want, plansMetric, metricdatatest.IgnoreTimestamp())
 }
+
+func TestOtelHTTPMetrics(t *testing.T) {
+	ctx := t.Context()
+
+	reader := sdkmetric.NewManualReader()
+	mp := sdkmetric.NewMeterProvider(sdkmetric.WithReader(reader))
+	prevMP := otel.GetMeterProvider()
+	otel.SetMeterProvider(mp)
+	// Restore and shut down via defer rather than t.Cleanup: t.Context() is
+	// canceled before cleanup functions run, so a cleanup would hand Shutdown
+	// an already-canceled context.
+	defer func() {
+		otel.SetMeterProvider(prevMP)
+		assert.NoError(t, mp.Shutdown(ctx))
+	}()
+
+	svc := newTestService()
+	mux := http.NewServeMux()
+	svc.ConfigureRoutes(mux)
+	handler := otelhttp.NewHandler(mux, "schemabot")
+
+	// Hit /health — the one route guaranteed to work with mock storage.
+	req := httptest.NewRequestWithContext(ctx, http.MethodGet, "/health", nil)
+	w := httptest.NewRecorder()
+	handler.ServeHTTP(w, req)
+	assert.Equal(t, http.StatusOK, w.Code)
+
+	var rm metricdata.ResourceMetrics
+	require.NoError(t, reader.Collect(ctx, &rm))
+
+	// Index the collected metrics by name so each can be looked up directly.
+	byName := make(map[string]metricdata.Metrics)
+	for _, sm := range rm.ScopeMetrics {
+		for _, m := range sm.Metrics {
+			byName[m.Name] = m
+		}
+	}
+
+	// Verify otelhttp produced the standard HTTP server metrics.
+	for _, name := range []string{
+		"http.server.request.duration",
+		"http.server.request.body.size",
+		"http.server.response.body.size",
+	} {
+		_, found := byName[name]
+		assert.True(t, found, "expected %s metric", name)
+	}
+
+	// Verify the duration histogram has data points with expected attributes.
+	duration, found := byName["http.server.request.duration"]
+	require.True(t, found, "duration metric missing; cannot inspect data points")
+	hist, ok := duration.Data.(metricdata.Histogram[float64])
+	require.True(t, ok, "expected float64 histogram, got %T", duration.Data)
+	require.NotEmpty(t, hist.DataPoints, "expected at least one duration data point")
+
+	// Verify data points have standard HTTP attributes.
+	for _, dp := range hist.DataPoints {
+		for _, key := range []attribute.Key{"http.request.method", "http.response.status_code"} {
+			assert.True(t, dp.Attributes.HasValue(key), "expected %s attribute on duration data point", key)
+		}
+	}
+}
diff --git a/pkg/cmd/commands/serve.go b/pkg/cmd/commands/serve.go
index 1be7b78..4385950 100644
--- a/pkg/cmd/commands/serve.go
+++ b/pkg/cmd/commands/serve.go
@@ -15,6 +15,7 @@ import (
 
 	"github.com/block/spirit/pkg/utils"
 	_ "github.com/go-sql-driver/mysql"
+	"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
 	"google.golang.org/grpc"
 
 	"github.com/block/schemabot/pkg/api"
@@ -117,10 +118,14 @@ func (cmd *ServeCmd) Run(g *Globals) error {
 	}
 	mux.Handle("POST /webhook", webhookHandler)
 
+	// Wrap mux with OTel HTTP instrumentation for automatic request
+	// duration, request body size, and response body size metrics.
+	handler := otelhttp.NewHandler(mux, "schemabot")
+
 	// Create server
 	server := &http.Server{
 		Addr:         ":" + port,
-		Handler:      mux,
+		Handler:      handler,
 		ReadTimeout:  30 * time.Second,
 		WriteTimeout: 30 * time.Second,
 		IdleTimeout:  60 * time.Second,