Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion app/services/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@ func (mp *metricspusher) Push(cred credential.Credential) error {
}
}

// ── Traefik metrics (HTTP requests / response time / error rate per entrypoint) ──
// ── Traefik metrics (aggregate per entrypoint) ──────────────────────────────

traefikSets, err := mp.traefikcollector.Collect(ctx)
if err != nil {
Expand All @@ -355,6 +355,28 @@ func (mp *metricspusher) Push(cred credential.Credential) error {
}
}

// ── Traefik router metrics (per-app, excludes catchall noise) ────────────────

routerSets, err := mp.traefikcollector.CollectRouters(ctx)
if err != nil {
log.Warnf("traefik router metrics collection failed: %v", err)
} else {
for _, rs := range routerSets {
attrs := map[string]any{
"router_name": rs.Attributes.RouterName,
"entrypoint_name": rs.Attributes.EntrypointName,
}
if rs.Attributes.Service != "" {
attrs["service"] = rs.Attributes.Service
}
metricSets = append(metricSets, domainmetrics.MetricSet{
Type: domainmetrics.MetricTypeTraefikRouter,
Attributes: attrs,
Metrics: rs.Metrics,
})
}
}

hostname, _ := os.Hostname()

payload := domainmetrics.MetricPayload{
Expand Down
10 changes: 10 additions & 0 deletions app/services/metrics/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,14 @@ func (m *MockTraefikCollector) Collect(ctx context.Context) ([]traefikmetrics.En
return args.Get(0).([]traefikmetrics.EntrypointMetricSet), args.Error(1)
}

// CollectRouters is the mocked counterpart of the collector's CollectRouters.
// It records the call and hands back whatever the test configured: the first
// return value is passed through as a []traefikmetrics.RouterMetricSet (or nil
// when the test supplied an untyped nil), the second as the error.
func (m *MockTraefikCollector) CollectRouters(ctx context.Context) ([]traefikmetrics.RouterMetricSet, error) {
	call := m.Called(ctx)

	first := call.Get(0)
	if first == nil {
		// An untyped nil cannot be type-asserted, so return it directly.
		return nil, call.Error(1)
	}

	sets := first.([]traefikmetrics.RouterMetricSet)
	return sets, call.Error(1)
}

// MockDockerDiscoverer is a test double built on mock.Mock: embedding it gives
// the type the expectation/recording methods tests use to script its calls.
// NOTE(review): presumably stands in for the Docker discoverer dependency of
// the metrics pusher — confirm against the methods defined on it.
type MockDockerDiscoverer struct {
mock.Mock
}
Expand Down Expand Up @@ -314,6 +322,8 @@ func setupTestMetricsPusher() (*metricspusher, *testMocks) {
Return([]containermetrics.ContainerMetricSet(nil), nil)
mocks.traefikcollector.On("Collect", mock.Anything).
Return([]traefikmetrics.EntrypointMetricSet(nil), nil)
mocks.traefikcollector.On("CollectRouters", mock.Anything).
Return([]traefikmetrics.RouterMetricSet(nil), nil)

return mp, mocks
}
Expand Down
22 changes: 22 additions & 0 deletions domain/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ const (
MetricTypeRedis = "redis"
MetricTypeContainer = "container"
MetricTypeTraefikService = "traefik.proxy"
MetricTypeTraefikRouter = "traefik.router"
)

type MetricPayload struct {
Expand Down Expand Up @@ -187,6 +188,27 @@ type TraefikEntrypointAttributes struct {
EntrypointName string `json:"entrypoint_name"`
}

// TraefikRouterMetrics holds per-router HTTP metrics. Unlike entrypoint metrics
// which are aggregate across all traffic, router metrics map 1:1 to a deployed
// app and exclude unmatched/catchall traffic noise.
//
// Field semantics below describe how the traefikmetrics collector populates
// the struct; all fields are zero when the corresponding samples are absent.
type TraefikRouterMetrics struct {
// RequestsTotal is the sum of traefik_router_requests_total over every
// status code seen for the router (including 1xx/3xx).
RequestsTotal int64 `json:"requests_total"`
// Requests2xx counts requests whose status code starts with "2".
Requests2xx int64 `json:"requests_2xx"`
// Requests4xx counts requests whose status code starts with "4".
Requests4xx int64 `json:"requests_4xx"`
// Requests5xx counts requests whose status code starts with "5".
Requests5xx int64 `json:"requests_5xx"`
// ErrorRate is (Requests4xx + Requests5xx) / RequestsTotal expressed as a
// percentage (0–100); it stays 0 when RequestsTotal is 0.
ErrorRate float64 `json:"error_rate"`
// AvgResponseTimeMs is the histogram duration sum divided by its count,
// converted from seconds to milliseconds.
AvgResponseTimeMs float64 `json:"avg_response_time_ms"`
// P50/P95/P99ResponseTimeMs are percentile estimates derived from the
// request-duration histogram buckets, in milliseconds.
P50ResponseTimeMs float64 `json:"p50_response_time_ms"`
P95ResponseTimeMs float64 `json:"p95_response_time_ms"`
P99ResponseTimeMs float64 `json:"p99_response_time_ms"`
}

// TraefikRouterAttributes identifies the router a TraefikRouterMetrics value
// belongs to.
type TraefikRouterAttributes struct {
// RouterName is the router label up to (not including) its first "@", as
// emitted by the traefikmetrics collector.
RouterName string `json:"router_name"`
// EntrypointName is the value of the sample's "entrypoint" label; it is
// empty when the label is absent.
EntrypointName string `json:"entrypoint_name"`
// Service is the value of the sample's "service" label; it is dropped from
// the JSON encoding when empty.
Service string `json:"service,omitempty"`
}

type ContainerAttributes struct {
ContainerID string `json:"container_id"`
ContainerName string `json:"container_name"`
Expand Down
124 changes: 124 additions & 0 deletions internal/traefikmetrics/collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,19 @@ import (

// Collector gathers Traefik HTTP metrics at two granularities.
type Collector interface {
// Collect returns one metric set per entrypoint.
Collect(ctx context.Context) ([]EntrypointMetricSet, error)
// CollectRouters returns one metric set per router, with routers whose
// name ends in "@internal" left out.
CollectRouters(ctx context.Context) ([]RouterMetricSet, error)
}

// EntrypointMetricSet pairs the metrics collected for one entrypoint with the
// attributes that identify that entrypoint.
type EntrypointMetricSet struct {
Attributes domainmetrics.TraefikEntrypointAttributes
Metrics domainmetrics.TraefikEntrypointMetrics
}

// RouterMetricSet pairs the metrics collected for one router with the
// attributes (router, entrypoint, service) that identify it.
type RouterMetricSet struct {
Attributes domainmetrics.TraefikRouterAttributes
Metrics domainmetrics.TraefikRouterMetrics
}

type lastRequestsEntrypoint struct {
total int64
collectedAt time.Time
Expand Down Expand Up @@ -281,6 +287,124 @@ func (tc *traefikCollector) aggregate(text string) ([]EntrypointMetricSet, error
return results, nil
}

// CollectRouters scrapes per-router metrics from Traefik's Prometheus endpoint.
// Routers map 1:1 to deployed apps — routers whose name ends in @internal
// (e.g. catchall@internal, traefik@internal) are excluded during aggregation
// so the error rate reflects real app traffic only.
//
// It returns an error when the request cannot be built or sent, when the
// endpoint answers with anything other than 200 OK, or when the body cannot be
// read. On success the scraped text is handed to aggregateRouters.
func (tc *traefikCollector) CollectRouters(ctx context.Context) ([]RouterMetricSet, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, tc.endpoint, nil)
	if err != nil {
		return nil, fmt.Errorf("build request: %w", err)
	}

	resp, err := tc.client.Do(req)
	if err != nil {
		return nil, fmt.Errorf("fetch %s: %w", tc.endpoint, err)
	}
	defer resp.Body.Close()

	// Client.Do does not treat non-2xx responses as errors. Without this check
	// an error page would be parsed as metrics text and silently reported as
	// "no routers", hiding a misconfigured or unhealthy metrics endpoint.
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("fetch %s: unexpected status %s", tc.endpoint, resp.Status)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("read body: %w", err)
	}
	return tc.aggregateRouters(string(body))
}

// routerAgg accumulates the raw samples seen for one router/entrypoint pair
// while the scrape is being folded into a RouterMetricSet.
type routerAgg struct {
// entrypoint and service are captured from the first sample that creates
// the aggregate; later samples do not overwrite them.
entrypoint string
service string
// requestsTotal is the running sum of traefik_router_requests_total;
// the 2xx/4xx/5xx fields hold the share whose "code" label starts with
// that digit.
requestsTotal int64
requests2xx int64
requests4xx int64
requests5xx int64
// buckets maps a histogram upper bound ("le", in seconds) to the summed
// bucket value across label combinations; the "+Inf" bucket is not stored.
buckets map[float64]float64
// durationSum and durationCount are the summed _sum and _count series of
// the request-duration histogram.
durationSum float64
durationCount float64
}

// aggregateRouters folds Prometheus exposition text into one RouterMetricSet
// per (router, entrypoint) pair.
//
// Samples without a "router" label and samples whose router ends in
// "@internal" are ignored. Request counters are summed and split by the first
// digit of the "code" label; the request-duration histogram feeds the average
// and the p50/p95/p99 estimates (seconds converted to milliseconds).
//
// Results are returned in the order each pair is first encountered in the
// parsed samples rather than in Go's randomised map order. The slice is nil
// when no router samples are present, and the error is always nil.
func (tc *traefikCollector) aggregateRouters(text string) ([]RouterMetricSet, error) {
	samples := parseSamples(text)

	// A struct key keeps router and entrypoint apart. Router names already
	// carry an "@provider" suffix, so a joined "router@entrypoint" string is
	// ambiguous and cannot be split back reliably.
	type routerKey struct {
		router     string
		entrypoint string
	}

	routers := make(map[routerKey]*routerAgg)
	var order []routerKey // first-seen order, for stable output
	ensure := func(router, entrypoint, service string) *routerAgg {
		key := routerKey{router: router, entrypoint: entrypoint}
		if agg, ok := routers[key]; ok {
			return agg
		}
		agg := &routerAgg{
			entrypoint: entrypoint,
			service:    service,
			buckets:    make(map[float64]float64),
		}
		routers[key] = agg
		order = append(order, key)
		return agg
	}

	for _, s := range samples {
		router := s.labels["router"]
		// Skip non-router series and internal/catchall routers — the latter
		// represent unmatched traffic noise.
		if router == "" || strings.HasSuffix(router, "@internal") {
			continue
		}
		agg := ensure(router, s.labels["entrypoint"], s.labels["service"])

		switch s.name {
		case "traefik_router_requests_total":
			count := int64(s.value)
			agg.requestsTotal += count
			code := s.labels["code"]
			switch {
			case strings.HasPrefix(code, "2"):
				agg.requests2xx += count
			case strings.HasPrefix(code, "4"):
				agg.requests4xx += count
			case strings.HasPrefix(code, "5"):
				agg.requests5xx += count
			}
		case "traefik_router_request_duration_seconds_bucket":
			leStr := s.labels["le"]
			// "+Inf" parses as a valid float, so it must be excluded
			// explicitly; its count is already carried by the _count series.
			if leStr == "+Inf" {
				continue
			}
			if le, err := strconv.ParseFloat(leStr, 64); err == nil {
				agg.buckets[le] += s.value
			}
		case "traefik_router_request_duration_seconds_sum":
			agg.durationSum += s.value
		case "traefik_router_request_duration_seconds_count":
			agg.durationCount += s.value
		}
	}

	if len(order) == 0 {
		return nil, nil
	}

	results := make([]RouterMetricSet, 0, len(order))
	for _, key := range order {
		agg := routers[key]

		// The reported name is the router label up to its first "@".
		// NOTE(review): this drops the provider suffix, so "app@docker" and
		// "app@file" are reported under the same router_name — confirm that
		// is what consumers expect.
		routerName, _, _ := strings.Cut(key.router, "@")

		m := domainmetrics.TraefikRouterMetrics{
			RequestsTotal: agg.requestsTotal,
			Requests2xx:   agg.requests2xx,
			Requests4xx:   agg.requests4xx,
			Requests5xx:   agg.requests5xx,
		}
		if agg.requestsTotal > 0 {
			m.ErrorRate = float64(agg.requests4xx+agg.requests5xx) / float64(agg.requestsTotal) * 100
		}
		if agg.durationCount > 0 {
			m.AvgResponseTimeMs = (agg.durationSum / agg.durationCount) * 1000
			if len(agg.buckets) > 0 {
				m.P50ResponseTimeMs = pct(agg.buckets, agg.durationCount, 0.50) * 1000
				m.P95ResponseTimeMs = pct(agg.buckets, agg.durationCount, 0.95) * 1000
				m.P99ResponseTimeMs = pct(agg.buckets, agg.durationCount, 0.99) * 1000
			}
		}

		results = append(results, RouterMetricSet{
			Attributes: domainmetrics.TraefikRouterAttributes{
				RouterName:     routerName,
				EntrypointName: agg.entrypoint,
				Service:        agg.service,
			},
			Metrics: m,
		})
	}
	return results, nil
}

// pct returns the estimated p-th percentile (0–1) from a cumulative histogram.
// buckets maps upper-bound seconds → cumulative count; total is the overall count.
func pct(buckets map[float64]float64, total, p float64) float64 {
Expand Down
Loading