From 392ec175708bd9cc562b61574119cafec15f42f7 Mon Sep 17 00:00:00 2001 From: RichardoMrMu Date: Wed, 25 Mar 2026 13:33:04 +0800 Subject: [PATCH 1/3] feat: integrate Prometheus metrics for HTTP and LLM observability - Register PrometheusMiddleware in main.go for HTTP golden metrics - Add /metrics endpoint in router for Prometheus scraping - Call RecordAIMetrics in PostTextConsumeQuota (text_quota.go) and PostWssConsumeQuota (quota.go) for LLM metrics collection HTTP metrics (middleware/metrics.go - already present): - newapi_http_requests_total (Counter) - newapi_http_request_duration_seconds (Histogram) - newapi_http_requests_in_flight (Gauge) - newapi_http_response_size_bytes (Histogram) LLM metrics (metrics/ai_metrics.go - already present): - newapi_llm_input_token_total (Counter) - newapi_llm_output_token_total (Counter) - newapi_llm_request_total (Counter) - newapi_llm_service_duration_seconds (Histogram) - newapi_llm_first_token_duration_seconds (Histogram/TTFT) Closes #2402 --- main.go | 1 + router/main.go | 1 + service/quota.go | 7 +++++++ service/text_quota.go | 3 +++ 4 files changed, 12 insertions(+) diff --git a/main.go b/main.go index dbbf44a182..10215488b9 100644 --- a/main.go +++ b/main.go @@ -164,6 +164,7 @@ func main() { })) // This will cause SSE not to work!!! //server.Use(gzip.Gzip(gzip.DefaultCompression)) + server.Use(middleware.PrometheusMiddleware()) server.Use(middleware.RequestId()) server.Use(middleware.PoweredBy()) server.Use(middleware.I18n()) diff --git a/router/main.go b/router/main.go index ac9506fe45..83bfb37264 100644 --- a/router/main.go +++ b/router/main.go @@ -14,6 +14,7 @@ import ( ) func SetRouter(router *gin.Engine, buildFS embed.FS, indexPage []byte) { + router.GET("/metrics", middleware.MetricsHandler()) SetApiRouter(router) SetDashboardRouter(router) SetRelayRouter(router) diff --git a/service/quota.go b/service/quota.go index 9dc84ab4be..75400a99e8 100644 --- a/service/quota.go +++ b/service/quota.go @@ -12,6 +12,7 @@ import ( "github.com/QuantumNous/new-api/constant" "github.com/QuantumNous/new-api/dto" "github.com/QuantumNous/new-api/logger" + "github.com/QuantumNous/new-api/metrics" "github.com/QuantumNous/new-api/model" relaycommon "github.com/QuantumNous/new-api/relay/common" "github.com/QuantumNous/new-api/setting/ratio_setting" @@ -233,6 +234,12 @@ func PostWssConsumeQuota(ctx *gin.Context, relayInfo *relaycommon.RelayInfo, mod Group: relayInfo.UsingGroup, Other: other, }) + + metrics.RecordAIMetrics(relayInfo, &dto.Usage{ + PromptTokens: usage.InputTokens, + CompletionTokens: usage.OutputTokens, + TotalTokens: usage.TotalTokens, + }) } func CalcOpenRouterCacheCreateTokens(usage dto.Usage, priceData types.PriceData) int { diff --git a/service/text_quota.go b/service/text_quota.go index a300097e50..5ebfc15c6a 100644 --- a/service/text_quota.go +++ b/service/text_quota.go @@ -9,6 +9,7 @@ import ( "github.com/QuantumNous/new-api/constant" "github.com/QuantumNous/new-api/dto" "github.com/QuantumNous/new-api/logger" + "github.com/QuantumNous/new-api/metrics" "github.com/QuantumNous/new-api/model" relaycommon "github.com/QuantumNous/new-api/relay/common" "github.com/QuantumNous/new-api/setting/operation_setting" @@ -424,4 +425,6 @@ func PostTextConsumeQuota(ctx *gin.Context, relayInfo *relaycommon.RelayInfo, us Group: relayInfo.UsingGroup, Other: other, }) + + metrics.RecordAIMetrics(relayInfo, usage) } From cc0a5bad68b17668d4296b3da0ed5ae69079b47d Mon Sep 17 00:00:00 2001 From: RichardoMrMu Date: Wed, 25 Mar 2026 16:15:51 +0800 Subject: [PATCH 2/3] fix: add metrics recording to LogTaskConsumption for task-based requests Address CodeRabbit review feedback: LogTaskConsumption (image generation, audio processing, video tasks) was missing metrics.RecordAIMetrics call, creating an observability gap for task-based requests. The call passes nil usage since task-based billing is per-invocation rather than per-token, but still records request count and duration. --- service/task_billing.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/service/task_billing.go b/service/task_billing.go index b887f66825..bea4d59cc1 100644 --- a/service/task_billing.go +++ b/service/task_billing.go @@ -8,6 +8,7 @@ import ( "github.com/QuantumNous/new-api/common" "github.com/QuantumNous/new-api/constant" "github.com/QuantumNous/new-api/logger" + "github.com/QuantumNous/new-api/metrics" "github.com/QuantumNous/new-api/model" relaycommon "github.com/QuantumNous/new-api/relay/common" "github.com/QuantumNous/new-api/setting/ratio_setting" @@ -58,6 +59,8 @@ func LogTaskConsumption(c *gin.Context, info *relaycommon.RelayInfo) { }) model.UpdateUserUsedQuotaAndRequestCount(info.UserId, info.PriceData.Quota) model.UpdateChannelUsedQuota(info.ChannelId, info.PriceData.Quota) + + metrics.RecordAIMetrics(info, nil) } // --------------------------------------------------------------------------- From 426cc06b837d550f1d3e2b8641f4a168da3a6775 Mon Sep 17 00:00:00 2001 From: RichardoMrMu Date: Wed, 25 Mar 2026 16:48:50 +0800 Subject: [PATCH 3/3] fix: add missing metrics package files (ai_metrics.go, metrics.go) These files were present locally but never committed to the branch, causing compilation failures. This commit adds: - metrics/ai_metrics.go: LLM metrics definitions and RecordAIMetrics() - middleware/metrics.go: HTTP Prometheus middleware and MetricsHandler() Both files are required by the integration points in main.go, router/main.go, service/quota.go, service/text_quota.go, and service/task_billing.go. --- metrics/ai_metrics.go | 219 ++++++++++++++++++++++++++++++++++++++++++ middleware/metrics.go | 115 ++++++++++++++++++++++ 2 files changed, 334 insertions(+) create mode 100644 metrics/ai_metrics.go create mode 100644 middleware/metrics.go diff --git a/metrics/ai_metrics.go b/metrics/ai_metrics.go new file mode 100644 index 0000000000..5e676e2418 --- /dev/null +++ b/metrics/ai_metrics.go @@ -0,0 +1,219 @@ +package metrics + +import ( + "fmt" + "os" + "strconv" + "time" + + "github.com/QuantumNous/new-api/dto" + relaycommon "github.com/QuantumNous/new-api/relay/common" + + "github.com/prometheus/client_golang/prometheus" +) + +// region is read once from MAAS_REGION env var at startup. +var region string + +func init() { + region = os.Getenv("MAAS_REGION") + if region == "" { + region = "unknown" + } + + prometheus.MustRegister( + llmInputTokenTotal, + llmOutputTokenTotal, + llmRequestTotal, + llmServiceDuration, + llmFirstTokenDuration, + llmTimePerOutputToken, + rateLimitTotal, + circuitBreakerState, + llmGatewayDuration, + ) +} + +// GetRegion returns the configured MAAS_REGION value. +func GetRegion() string { + return region +} + +// ---- LLM Metrics (6) ---- + +var llmRequestLabelNames = []string{ + "model", "channel", "upstream_model", "status", "error_type", + "region", "is_stream", "token_name", +} + +var llmTokenLabelNames = []string{ + "model", "channel", "upstream_model", "region", "token_name", +} + +var llmLatencyLabelNames = []string{ + "model", "channel", "region", +} + +var ( + llmRequestTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "newapi", + Name: "llm_request_total", + Help: "Total number of LLM requests", + }, + llmRequestLabelNames, + ) + + llmInputTokenTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "newapi", + Name: "llm_input_token_total", + Help: "Total number of LLM input (prompt) tokens", + }, + llmTokenLabelNames, + ) + + llmOutputTokenTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "newapi", + Name: "llm_output_token_total", + Help: "Total number of LLM output (completion) tokens", + }, + llmTokenLabelNames, + ) + + llmFirstTokenDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: "newapi", + Name: "llm_first_token_duration_seconds", + Help: "LLM time-to-first-token (TTFT) in seconds", + Buckets: []float64{0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30}, + }, + llmLatencyLabelNames, + ) + + llmTimePerOutputToken = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: "newapi", + Name: "llm_time_per_output_token_seconds", + Help: "LLM time per output token (TPOT) in seconds", + Buckets: []float64{0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1}, + }, + llmLatencyLabelNames, + ) + + llmServiceDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: "newapi", + Name: "llm_service_duration_seconds", + Help: "LLM upstream service duration in seconds (from request start to response complete)", + Buckets: []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60, 120, 300}, + }, + llmLatencyLabelNames, + ) +) + +// ---- Rate Limit / Circuit Breaker / Gateway Metrics (3) ---- + +var ( + rateLimitTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "newapi", + Name: "rate_limit_total", + Help: "Total number of rate limit triggers", + }, + []string{"model", "channel", "type", "token_name"}, + ) + + circuitBreakerState = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: "newapi", + Name: "circuit_breaker_state", + Help: "Circuit breaker state (0=Closed, 1=HalfOpen, 2=Open)", + }, + []string{"channel", "model"}, + ) + + llmGatewayDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: "newapi", + Name: "llm_gateway_duration_seconds", + Help: "Gateway processing duration in seconds (excluding upstream)", + Buckets: []float64{0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5}, + }, + []string{"model", "channel"}, + ) +) + +// RecordAIMetrics should be called after a relay request completes. +// It records token counts, request count, service duration, TTFT, and TPOT. +func RecordAIMetrics(relayInfo *relaycommon.RelayInfo, usage *dto.Usage) { + RecordAIMetricsWithStatus(relayInfo, usage, "success", "") +} + +// RecordAIMetricsWithStatus records LLM metrics with explicit status and error type. +func RecordAIMetricsWithStatus(relayInfo *relaycommon.RelayInfo, usage *dto.Usage, status string, errorType string) { + if relayInfo == nil || relayInfo.ChannelMeta == nil { + return + } + + model := relayInfo.OriginModelName + channel := fmt.Sprintf("%d", relayInfo.ChannelMeta.ChannelId) + upstreamModel := relayInfo.ChannelMeta.UpstreamModelName + tokenName := "" + if relayInfo.TokenKey != "" { + tokenName = relayInfo.TokenKey + } + isStream := strconv.FormatBool(relayInfo.IsStream) + + // Request count (with status labels) + llmRequestTotal.WithLabelValues( + model, channel, upstreamModel, status, errorType, + region, isStream, tokenName, + ).Inc() + + // Token counts + if usage != nil { + tokenLabels := []string{model, channel, upstreamModel, region, tokenName} + llmInputTokenTotal.WithLabelValues(tokenLabels...).Add(float64(usage.PromptTokens)) + llmOutputTokenTotal.WithLabelValues(tokenLabels...).Add(float64(usage.CompletionTokens)) + } + + latencyLabels := []string{model, channel, region} + + // Service duration (total time from request start to now) + serviceDuration := time.Since(relayInfo.StartTime).Seconds() + llmServiceDuration.WithLabelValues(latencyLabels...).Observe(serviceDuration) + + // Time-to-first-token (only meaningful when FirstResponseTime was recorded) + if !relayInfo.FirstResponseTime.IsZero() { + ttft := relayInfo.FirstResponseTime.Sub(relayInfo.StartTime).Seconds() + if ttft > 0 { + llmFirstTokenDuration.WithLabelValues(latencyLabels...).Observe(ttft) + } + + // Time per output token (TPOT): (total_duration - ttft) / output_tokens + if usage != nil && usage.CompletionTokens > 0 { + generationDuration := serviceDuration - ttft + if generationDuration > 0 { + tpot := generationDuration / float64(usage.CompletionTokens) + llmTimePerOutputToken.WithLabelValues(latencyLabels...).Observe(tpot) + } + } + } +} + +// RecordRateLimit records a rate limit trigger event. +func RecordRateLimit(model string, channel string, limitType string, tokenName string) { + rateLimitTotal.WithLabelValues(model, channel, limitType, tokenName).Inc() +} + +// RecordCircuitBreakerState updates the circuit breaker state gauge. +func RecordCircuitBreakerState(channel string, model string, state float64) { + circuitBreakerState.WithLabelValues(channel, model).Set(state) +} + +// RecordGatewayDuration records the gateway processing duration (excluding upstream). +func RecordGatewayDuration(model string, channel string, durationSeconds float64) { + llmGatewayDuration.WithLabelValues(model, channel).Observe(durationSeconds) +} diff --git a/middleware/metrics.go b/middleware/metrics.go new file mode 100644 index 0000000000..3bf93b9800 --- /dev/null +++ b/middleware/metrics.go @@ -0,0 +1,115 @@ +package middleware + +import ( + "strconv" + "time" + + "github.com/QuantumNous/new-api/metrics" + + "github.com/gin-gonic/gin" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" +) + +var httpLabelNames = []string{"method", "path", "status", "region"} + +var ( + httpRequestsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "newapi", + Name: "http_requests_total", + Help: "Total number of HTTP requests", + }, + httpLabelNames, + ) + + httpRequestDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: "newapi", + Name: "http_request_duration_seconds", + Help: "HTTP request duration in seconds", + Buckets: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60}, + }, + httpLabelNames, + ) + + httpRequestsInFlight = prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: "newapi", + Name: "http_requests_in_flight", + Help: "Number of HTTP requests currently being processed", + }, + ) + + httpResponseSizeBytes = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: "newapi", + Name: "http_response_size_bytes", + Help: "HTTP response size in bytes", + Buckets: prometheus.ExponentialBuckets(100, 10, 8), + }, + httpLabelNames, + ) +) + +func init() { + prometheus.MustRegister( + httpRequestsTotal, + httpRequestDuration, + httpRequestsInFlight, + httpResponseSizeBytes, + ) +} + +// PrometheusMiddleware collects HTTP golden metrics for each request. +// It records request count, latency, in-flight requests, and response size. +func PrometheusMiddleware() gin.HandlerFunc { + return func(c *gin.Context) { + if c.Request.URL.Path == "/metrics" { + c.Next() + return + } + + startTime := time.Now() + httpRequestsInFlight.Inc() + + c.Next() + + httpRequestsInFlight.Dec() + + statusCode := strconv.Itoa(c.Writer.Status()) + routePath := normalizeRoutePath(c) + method := c.Request.Method + duration := time.Since(startTime).Seconds() + responseSize := float64(c.Writer.Size()) + regionLabel := metrics.GetRegion() + + httpRequestsTotal.WithLabelValues(method, routePath, statusCode, regionLabel).Inc() + httpRequestDuration.WithLabelValues(method, routePath, statusCode, regionLabel).Observe(duration) + httpResponseSizeBytes.WithLabelValues(method, routePath, statusCode, regionLabel).Observe(responseSize) + } +} + +// MetricsHandler returns the Prometheus metrics HTTP handler for the /metrics endpoint. +func MetricsHandler() gin.HandlerFunc { + handler := promhttp.Handler() + return func(c *gin.Context) { + handler.ServeHTTP(c.Writer, c.Request) + } +} + +// normalizeRoutePath extracts the matched route template to avoid high-cardinality labels. +// Falls back to a generic label if no route template is available. +func normalizeRoutePath(c *gin.Context) string { + routePath := c.FullPath() + if routePath != "" { + return routePath + } + + routeTag, exists := c.Get(RouteTagKey) + if exists { + return routeTag.(string) + } + + return "unmatched" +}