From 4b20025623e77df9d2cefe1dac87751c04ac5d0b Mon Sep 17 00:00:00 2001 From: Shad Mirza Date: Wed, 11 Mar 2026 22:04:14 +0530 Subject: [PATCH 1/3] do-agent: add SI metrics support with aggregation and whitelist - Introduced SI metrics path configuration for Serverless Inference (SI) metrics collection. - Added SI metrics aggregation specifications to drop high-cardinality labels. - Implemented a whitelist for SI metrics to control which metrics are collected. --- cmd/do-agent/aggregation.go | 18 ++++++++++++++++++ cmd/do-agent/config.go | 18 ++++++++++++++++++ cmd/do-agent/whitelist.go | 8 ++++++++ 3 files changed, 44 insertions(+) diff --git a/cmd/do-agent/aggregation.go b/cmd/do-agent/aggregation.go index 9548dcd..9704dbc 100644 --- a/cmd/do-agent/aggregation.go +++ b/cmd/do-agent/aggregation.go @@ -237,3 +237,21 @@ var diAggregationSpec = map[string][]string{ "gradient_infra_di_vllm:request_time_per_output_token_seconds_bucket": diLabelsToDrop, "gradient_infra_di_vllm:time_to_first_token_seconds_bucket": diLabelsToDrop, } + +// SI metrics: drop high-cardinality labels we don't want to keep. +var siLabelsToDrop = []string{ + "container", + "job", + "otel_scope_name", + "otel_scope_schema_url", + "otel_scope_version", + "resource_uuid", +} + +var siAggregationSpec = map[string][]string{ + "gen_ai_otel_inference_proxy_http_requests_total": siLabelsToDrop, + "gen_ai_otel_inference_proxy_token_throughput": siLabelsToDrop, + "gen_ai_otel_inference_proxy_rate_limit_exceeded": siLabelsToDrop, + "gen_ai_otel_inference_proxy_cache_hit": siLabelsToDrop, + "gen_ai_otel_inference_proxy_cache_miss": siLabelsToDrop, +} diff --git a/cmd/do-agent/config.go b/cmd/do-agent/config.go index c3c8e2c..2f7eae6 100644 --- a/cmd/do-agent/config.go +++ b/cmd/do-agent/config.go @@ -47,6 +47,7 @@ var ( defaultMaxMetricLength int promAddr string diMetricsPath string + siMetricsPath string gpuMetricsPath string topK int scrapeTimeout time.Duration @@ -131,6 +132,9 @@ func init() { kingpin.Flag("di-metrics-path", "enable Dedicated Inference (DI) metrics collection from a prometheus endpoint"). StringVar(&config.diMetricsPath) + kingpin.Flag("si-metrics-path", "enable Serverless Inference (SI) metrics collection from a prometheus endpoint"). + StringVar(&config.siMetricsPath) + kingpin.Flag("web.listen", "enable a local endpoint for scrapeable prometheus metrics as well"). Default("false"). BoolVar(&config.webListen) @@ -267,6 +271,11 @@ func initAggregatorSpecs() map[string][]string { aggregateSpecs[k] = append(aggregateSpecs[k], v...) } } + if config.siMetricsPath != "" { + for k, v := range siAggregationSpec { + aggregateSpecs[k] = append(aggregateSpecs[k], v...) + } + } return aggregateSpecs } @@ -350,6 +359,15 @@ func initCollectors() []prometheus.Collector { } } + if config.siMetricsPath != "" { + si, err := collector.NewScraper("si", config.siMetricsPath, nil, siWhitelist, collector.WithTimeout(config.scrapeTimeout)) + if err != nil { + log.Error("Failed to initialize SI metrics collector: %+v", err) + } else { + cols = append(cols, si) + } + } + if config.gpuMetricsPath != "" { gpu, err := collector.NewScraper("gpu", config.gpuMetricsPath, nil, gpuWhitelist, collector.WithTimeout(config.scrapeTimeout)) if err != nil { diff --git a/cmd/do-agent/whitelist.go b/cmd/do-agent/whitelist.go index 0be6eee..b195472 100644 --- a/cmd/do-agent/whitelist.go +++ b/cmd/do-agent/whitelist.go @@ -274,3 +274,11 @@ var diWhitelist = map[string]bool{ "gradient_infra_di_vllm:request_time_per_output_token_seconds_bucket": true, "gradient_infra_di_vllm:time_to_first_token_seconds_bucket": true, } + +var siWhitelist = map[string]bool{ + "gen_ai_otel_inference_proxy_http_requests_total": true, + "gen_ai_otel_inference_proxy_token_throughput": true, + "gen_ai_otel_inference_proxy_rate_limit_exceeded": true, + "gen_ai_otel_inference_proxy_cache_hit": true, + "gen_ai_otel_inference_proxy_cache_miss": true, +} From 8b78bd27a77ba4ad4f3ff896f98caad85b38c365 Mon Sep 17 00:00:00 2001 From: Shad Mirza Date: Fri, 13 Mar 2026 15:31:43 +0530 Subject: [PATCH 2/3] formatting fixes --- cmd/do-agent/aggregation.go | 10 +++++----- cmd/do-agent/whitelist.go | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/cmd/do-agent/aggregation.go b/cmd/do-agent/aggregation.go index 9704dbc..c945975 100644 --- a/cmd/do-agent/aggregation.go +++ b/cmd/do-agent/aggregation.go @@ -249,9 +249,9 @@ var siLabelsToDrop = []string{ } var siAggregationSpec = map[string][]string{ - "gen_ai_otel_inference_proxy_http_requests_total": siLabelsToDrop, - "gen_ai_otel_inference_proxy_token_throughput": siLabelsToDrop, - "gen_ai_otel_inference_proxy_rate_limit_exceeded": siLabelsToDrop, - "gen_ai_otel_inference_proxy_cache_hit": siLabelsToDrop, - "gen_ai_otel_inference_proxy_cache_miss": siLabelsToDrop, + "gen_ai_otel_inference_proxy_http_requests_total": siLabelsToDrop, + "gen_ai_otel_inference_proxy_token_throughput": siLabelsToDrop, + "gen_ai_otel_inference_proxy_rate_limit_exceeded": siLabelsToDrop, + "gen_ai_otel_inference_proxy_cache_hit": siLabelsToDrop, + "gen_ai_otel_inference_proxy_cache_miss": siLabelsToDrop, } diff --git a/cmd/do-agent/whitelist.go b/cmd/do-agent/whitelist.go index b195472..64946bb 100644 --- a/cmd/do-agent/whitelist.go +++ b/cmd/do-agent/whitelist.go @@ -276,9 +276,9 @@ var diWhitelist = map[string]bool{ } var siWhitelist = map[string]bool{ - "gen_ai_otel_inference_proxy_http_requests_total": true, - "gen_ai_otel_inference_proxy_token_throughput": true, - "gen_ai_otel_inference_proxy_rate_limit_exceeded": true, - "gen_ai_otel_inference_proxy_cache_hit": true, - "gen_ai_otel_inference_proxy_cache_miss": true, + "gen_ai_otel_inference_proxy_http_requests_total": true, + "gen_ai_otel_inference_proxy_token_throughput": true, + "gen_ai_otel_inference_proxy_rate_limit_exceeded": true, + "gen_ai_otel_inference_proxy_cache_hit": true, + "gen_ai_otel_inference_proxy_cache_miss": true, } From 026fe3752c15cda623662d40515fa74c958f77b3 Mon Sep 17 00:00:00 2001 From: Shad Mirza Date: Fri, 13 Mar 2026 15:33:43 +0530 Subject: [PATCH 3/3] update metric list --- cmd/do-agent/aggregation.go | 14 +++++++++----- cmd/do-agent/whitelist.go | 14 +++++++++----- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/cmd/do-agent/aggregation.go b/cmd/do-agent/aggregation.go index c945975..38130aa 100644 --- a/cmd/do-agent/aggregation.go +++ b/cmd/do-agent/aggregation.go @@ -249,9 +249,13 @@ var siLabelsToDrop = []string{ } var siAggregationSpec = map[string][]string{ - "gen_ai_otel_inference_proxy_http_requests_total": siLabelsToDrop, - "gen_ai_otel_inference_proxy_token_throughput": siLabelsToDrop, - "gen_ai_otel_inference_proxy_rate_limit_exceeded": siLabelsToDrop, - "gen_ai_otel_inference_proxy_cache_hit": siLabelsToDrop, - "gen_ai_otel_inference_proxy_cache_miss": siLabelsToDrop, + "gen_ai_platform_inference_proxy_http_requests_total": siLabelsToDrop, + "gen_ai_platform_inference_proxy_http_request_duration_seconds": siLabelsToDrop, + "gen_ai_platform_inference_proxy_token_throughput": siLabelsToDrop, + "gen_ai_platform_inference_proxy_rate_limit_exceeded": siLabelsToDrop, + "gen_ai_platform_inference_proxy_cache_hit": siLabelsToDrop, + "gen_ai_platform_inference_proxy_cache_miss": siLabelsToDrop, + "gen_ai_platform_inference_proxy_inference_client_ttft_duration": siLabelsToDrop, + "gen_ai_platform_inference_proxy_inference_client_itl_duration_seconds": siLabelsToDrop, + "gen_ai_platform_inference_proxy_inference_client_requests_total": siLabelsToDrop, } diff --git a/cmd/do-agent/whitelist.go b/cmd/do-agent/whitelist.go index 64946bb..cd133ba 100644 --- a/cmd/do-agent/whitelist.go +++ b/cmd/do-agent/whitelist.go @@ -276,9 +276,13 @@ var diWhitelist = map[string]bool{ } var siWhitelist = map[string]bool{ - "gen_ai_otel_inference_proxy_http_requests_total": true, - "gen_ai_otel_inference_proxy_token_throughput": true, - "gen_ai_otel_inference_proxy_rate_limit_exceeded": true, - "gen_ai_otel_inference_proxy_cache_hit": true, - "gen_ai_otel_inference_proxy_cache_miss": true, + "gen_ai_platform_inference_proxy_http_requests_total": true, + "gen_ai_platform_inference_proxy_http_request_duration_seconds": true, + "gen_ai_platform_inference_proxy_token_throughput": true, + "gen_ai_platform_inference_proxy_rate_limit_exceeded": true, + "gen_ai_platform_inference_proxy_cache_hit": true, + "gen_ai_platform_inference_proxy_cache_miss": true, + "gen_ai_platform_inference_proxy_inference_client_ttft_duration": true, + "gen_ai_platform_inference_proxy_inference_client_itl_duration_seconds": true, + "gen_ai_platform_inference_proxy_inference_client_requests_total": true, }