diff --git a/app/services/metrics/metrics.go b/app/services/metrics/metrics.go index 2b80743..9018320 100644 --- a/app/services/metrics/metrics.go +++ b/app/services/metrics/metrics.go @@ -338,7 +338,7 @@ func (mp *metricspusher) Push(cred credential.Credential) error { } } - // ── Traefik metrics (HTTP requests / response time / error rate per entrypoint) ── + // ── Traefik metrics (aggregate per entrypoint) ────────────────────────────── traefikSets, err := mp.traefikcollector.Collect(ctx) if err != nil { @@ -355,6 +355,28 @@ func (mp *metricspusher) Push(cred credential.Credential) error { } } + // ── Traefik router metrics (per-app, excludes catchall noise) ──────────────── + + routerSets, err := mp.traefikcollector.CollectRouters(ctx) + if err != nil { + log.Warnf("traefik router metrics collection failed: %v", err) + } else { + for _, rs := range routerSets { + attrs := map[string]any{ + "router_name": rs.Attributes.RouterName, + "entrypoint_name": rs.Attributes.EntrypointName, + } + if rs.Attributes.Service != "" { + attrs["service"] = rs.Attributes.Service + } + metricSets = append(metricSets, domainmetrics.MetricSet{ + Type: domainmetrics.MetricTypeTraefikRouter, + Attributes: attrs, + Metrics: rs.Metrics, + }) + } + } + hostname, _ := os.Hostname() payload := domainmetrics.MetricPayload{ diff --git a/app/services/metrics/metrics_test.go b/app/services/metrics/metrics_test.go index d775a11..963957b 100644 --- a/app/services/metrics/metrics_test.go +++ b/app/services/metrics/metrics_test.go @@ -243,6 +243,14 @@ func (m *MockTraefikCollector) Collect(ctx context.Context) ([]traefikmetrics.En return args.Get(0).([]traefikmetrics.EntrypointMetricSet), args.Error(1) } +func (m *MockTraefikCollector) CollectRouters(ctx context.Context) ([]traefikmetrics.RouterMetricSet, error) { + args := m.Called(ctx) + if args.Get(0) == nil { + return nil, args.Error(1) + } + return args.Get(0).([]traefikmetrics.RouterMetricSet), args.Error(1) +} + type MockDockerDiscoverer struct { mock.Mock } @@ -314,6 +322,8 @@ func setupTestMetricsPusher() (*metricspusher, *testMocks) { Return([]containermetrics.ContainerMetricSet(nil), nil) mocks.traefikcollector.On("Collect", mock.Anything). Return([]traefikmetrics.EntrypointMetricSet(nil), nil) + mocks.traefikcollector.On("CollectRouters", mock.Anything). + Return([]traefikmetrics.RouterMetricSet(nil), nil) return mp, mocks } diff --git a/domain/metrics/metrics.go b/domain/metrics/metrics.go index bf6b6a8..4dfe69b 100644 --- a/domain/metrics/metrics.go +++ b/domain/metrics/metrics.go @@ -12,6 +12,7 @@ const ( MetricTypeRedis = "redis" MetricTypeContainer = "container" MetricTypeTraefikService = "traefik.proxy" + MetricTypeTraefikRouter = "traefik.router" ) type MetricPayload struct { @@ -187,6 +188,27 @@ type TraefikEntrypointAttributes struct { EntrypointName string `json:"entrypoint_name"` } +// TraefikRouterMetrics holds per-router HTTP metrics. Unlike entrypoint metrics +// which are aggregate across all traffic, router metrics map 1:1 to a deployed +// app and exclude unmatched/catchall traffic noise. +type TraefikRouterMetrics struct { + RequestsTotal int64 `json:"requests_total"` + Requests2xx int64 `json:"requests_2xx"` + Requests4xx int64 `json:"requests_4xx"` + Requests5xx int64 `json:"requests_5xx"` + ErrorRate float64 `json:"error_rate"` + AvgResponseTimeMs float64 `json:"avg_response_time_ms"` + P50ResponseTimeMs float64 `json:"p50_response_time_ms"` + P95ResponseTimeMs float64 `json:"p95_response_time_ms"` + P99ResponseTimeMs float64 `json:"p99_response_time_ms"` +} + +type TraefikRouterAttributes struct { + RouterName string `json:"router_name"` + EntrypointName string `json:"entrypoint_name"` + Service string `json:"service,omitempty"` +} + type ContainerAttributes struct { ContainerID string `json:"container_id"` ContainerName string `json:"container_name"` diff --git a/internal/traefikmetrics/collector.go b/internal/traefikmetrics/collector.go index 0328209..4cfbff2 100644 --- a/internal/traefikmetrics/collector.go +++ b/internal/traefikmetrics/collector.go @@ -25,6 +25,7 @@ import ( type Collector interface { Collect(ctx context.Context) ([]EntrypointMetricSet, error) + CollectRouters(ctx context.Context) ([]RouterMetricSet, error) } type EntrypointMetricSet struct { @@ -32,6 +33,11 @@ type EntrypointMetricSet struct { Metrics domainmetrics.TraefikEntrypointMetrics } +type RouterMetricSet struct { + Attributes domainmetrics.TraefikRouterAttributes + Metrics domainmetrics.TraefikRouterMetrics +} + type lastRequestsEntrypoint struct { total int64 collectedAt time.Time @@ -281,6 +287,124 @@ func (tc *traefikCollector) aggregate(text string) ([]EntrypointMetricSet, error return results, nil } +// CollectRouters scrapes per-router metrics from Traefik's Prometheus endpoint. +// Routers map 1:1 to deployed apps — catchall@internal and traefik@internal are +// excluded so the error rate reflects real app traffic only. +func (tc *traefikCollector) CollectRouters(ctx context.Context) ([]RouterMetricSet, error) { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, tc.endpoint, nil) + if err != nil { + return nil, fmt.Errorf("build request: %w", err) + } + resp, err := tc.client.Do(req) + if err != nil { + return nil, fmt.Errorf("fetch %s: %w", tc.endpoint, err) + } + defer resp.Body.Close() + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("read body: %w", err) + } + return tc.aggregateRouters(string(body)) +} + +type routerAgg struct { + entrypoint string + service string + requestsTotal int64 + requests2xx int64 + requests4xx int64 + requests5xx int64 + buckets map[float64]float64 + durationSum float64 + durationCount float64 +} + +func (tc *traefikCollector) aggregateRouters(text string) ([]RouterMetricSet, error) { + samples := parseSamples(text) + + // key: "router@entrypoint" + routers := make(map[string]*routerAgg) + ensure := func(router, entrypoint, service string) *routerAgg { + key := router + "@" + entrypoint + if routers[key] == nil { + routers[key] = &routerAgg{entrypoint: entrypoint, service: service, buckets: make(map[float64]float64)} + } + return routers[key] + } + + for _, s := range samples { + router := s.labels["router"] + if router == "" { + continue + } + // Skip internal/catchall routers — they represent unmatched traffic noise + if strings.HasSuffix(router, "@internal") { + continue + } + entrypoint := s.labels["entrypoint"] + service := s.labels["service"] + agg := ensure(router, entrypoint, service) + + switch s.name { + case "traefik_router_requests_total": + count := int64(s.value) + agg.requestsTotal += count + switch { + case strings.HasPrefix(s.labels["code"], "2"): + agg.requests2xx += count + case strings.HasPrefix(s.labels["code"], "4"): + agg.requests4xx += count + case strings.HasPrefix(s.labels["code"], "5"): + agg.requests5xx += count + } + case "traefik_router_request_duration_seconds_bucket": + leStr := s.labels["le"] + if leStr == "+Inf" { + continue + } + le, err := strconv.ParseFloat(leStr, 64) + if err == nil { + agg.buckets[le] += s.value + } + case "traefik_router_request_duration_seconds_sum": + agg.durationSum += s.value + case "traefik_router_request_duration_seconds_count": + agg.durationCount += s.value + } + } + + var results []RouterMetricSet + for key, agg := range routers { + routerName := strings.SplitN(key, "@", 2)[0] + m := domainmetrics.TraefikRouterMetrics{ + RequestsTotal: agg.requestsTotal, + Requests2xx: agg.requests2xx, + Requests4xx: agg.requests4xx, + Requests5xx: agg.requests5xx, + } + if agg.requestsTotal > 0 { + m.ErrorRate = float64(agg.requests4xx+agg.requests5xx) / float64(agg.requestsTotal) * 100 + } + if agg.durationCount > 0 { + m.AvgResponseTimeMs = (agg.durationSum / agg.durationCount) * 1000 + } + if agg.durationCount > 0 && len(agg.buckets) > 0 { + m.P50ResponseTimeMs = pct(agg.buckets, agg.durationCount, 0.50) * 1000 + m.P95ResponseTimeMs = pct(agg.buckets, agg.durationCount, 0.95) * 1000 + m.P99ResponseTimeMs = pct(agg.buckets, agg.durationCount, 0.99) * 1000 + } + results = append(results, RouterMetricSet{ + Attributes: domainmetrics.TraefikRouterAttributes{ + RouterName: routerName, + EntrypointName: agg.entrypoint, + Service: agg.service, + }, + Metrics: m, + }) + } + return results, nil +} + // pct returns the estimated p-th percentile (0–1) from a cumulative histogram. // buckets maps upper-bound seconds → cumulative count; total is the overall count. func pct(buckets map[float64]float64, total, p float64) float64 {