From 392ec175708bd9cc562b61574119cafec15f42f7 Mon Sep 17 00:00:00 2001
From: RichardoMrMu <tianbowen.tbw@antgroup.com>
Date: Wed, 25 Mar 2026 13:33:04 +0800
Subject: [PATCH 1/3] feat: integrate Prometheus metrics for HTTP and LLM
 observability

- Register PrometheusMiddleware in main.go for HTTP golden metrics
- Add /metrics endpoint in router for Prometheus scraping
- Call RecordAIMetrics in PostTextConsumeQuota (text_quota.go) and
  PostWssConsumeQuota (quota.go) for LLM metrics collection

HTTP metrics (middleware/metrics.go - already present):
  - newapi_http_requests_total (Counter)
  - newapi_http_request_duration_seconds (Histogram)
  - newapi_http_requests_in_flight (Gauge)
  - newapi_http_response_size_bytes (Histogram)

LLM metrics (metrics/ai_metrics.go - already present):
  - newapi_llm_input_token_total (Counter)
  - newapi_llm_output_token_total (Counter)
  - newapi_llm_request_total (Counter)
  - newapi_llm_service_duration_seconds (Histogram)
  - newapi_llm_first_token_duration_seconds (Histogram/TTFT)

Closes #2402
---
 main.go               | 1 +
 router/main.go        | 1 +
 service/quota.go      | 7 +++++++
 service/text_quota.go | 3 +++
 4 files changed, 12 insertions(+)

diff --git a/main.go b/main.go
index dbbf44a182..10215488b9 100644
--- a/main.go
+++ b/main.go
@@ -164,6 +164,7 @@ func main() {
 	}))
 	// This will cause SSE not to work!!!
 	//server.Use(gzip.Gzip(gzip.DefaultCompression))
+	server.Use(middleware.PrometheusMiddleware())
 	server.Use(middleware.RequestId())
 	server.Use(middleware.PoweredBy())
 	server.Use(middleware.I18n())
diff --git a/router/main.go b/router/main.go
index ac9506fe45..83bfb37264 100644
--- a/router/main.go
+++ b/router/main.go
@@ -14,6 +14,7 @@ import (
 )
 
 func SetRouter(router *gin.Engine, buildFS embed.FS, indexPage []byte) {
+	router.GET("/metrics", middleware.MetricsHandler())
 	SetApiRouter(router)
 	SetDashboardRouter(router)
 	SetRelayRouter(router)
diff --git a/service/quota.go b/service/quota.go
index 9dc84ab4be..75400a99e8 100644
--- a/service/quota.go
+++ b/service/quota.go
@@ -12,6 +12,7 @@ import (
 	"github.com/QuantumNous/new-api/constant"
 	"github.com/QuantumNous/new-api/dto"
 	"github.com/QuantumNous/new-api/logger"
+	"github.com/QuantumNous/new-api/metrics"
 	"github.com/QuantumNous/new-api/model"
 	relaycommon "github.com/QuantumNous/new-api/relay/common"
 	"github.com/QuantumNous/new-api/setting/ratio_setting"
@@ -233,6 +234,12 @@ func PostWssConsumeQuota(ctx *gin.Context, relayInfo *relaycommon.RelayInfo, mod
 		Group:            relayInfo.UsingGroup,
 		Other:            other,
 	})
+
+	metrics.RecordAIMetrics(relayInfo, &dto.Usage{
+		PromptTokens:     usage.InputTokens,
+		CompletionTokens: usage.OutputTokens,
+		TotalTokens:      usage.TotalTokens,
+	})
 }
 
 func CalcOpenRouterCacheCreateTokens(usage dto.Usage, priceData types.PriceData) int {
diff --git a/service/text_quota.go b/service/text_quota.go
index a300097e50..5ebfc15c6a 100644
--- a/service/text_quota.go
+++ b/service/text_quota.go
@@ -9,6 +9,7 @@ import (
 	"github.com/QuantumNous/new-api/constant"
 	"github.com/QuantumNous/new-api/dto"
 	"github.com/QuantumNous/new-api/logger"
+	"github.com/QuantumNous/new-api/metrics"
 	"github.com/QuantumNous/new-api/model"
 	relaycommon "github.com/QuantumNous/new-api/relay/common"
 	"github.com/QuantumNous/new-api/setting/operation_setting"
@@ -424,4 +425,6 @@ func PostTextConsumeQuota(ctx *gin.Context, relayInfo *relaycommon.RelayInfo, us
 		Group:            relayInfo.UsingGroup,
 		Other:            other,
 	})
+
+	metrics.RecordAIMetrics(relayInfo, usage)
 }

From cc0a5bad68b17668d4296b3da0ed5ae69079b47d Mon Sep 17 00:00:00 2001
From: RichardoMrMu <tianbowen.tbw@antgroup.com>
Date: Wed, 25 Mar 2026 16:15:51 +0800
Subject: [PATCH 2/3] fix: add metrics recording to LogTaskConsumption for
 task-based requests

Address CodeRabbit review feedback: LogTaskConsumption (image generation,
audio processing, video tasks) was missing metrics.RecordAIMetrics call,
creating an observability gap for task-based requests.

The call passes nil usage since task-based billing is per-invocation
rather than per-token, but still records request count and duration.
---
 service/task_billing.go | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/service/task_billing.go b/service/task_billing.go
index b887f66825..bea4d59cc1 100644
--- a/service/task_billing.go
+++ b/service/task_billing.go
@@ -8,6 +8,7 @@ import (
 	"github.com/QuantumNous/new-api/common"
 	"github.com/QuantumNous/new-api/constant"
 	"github.com/QuantumNous/new-api/logger"
+	"github.com/QuantumNous/new-api/metrics"
 	"github.com/QuantumNous/new-api/model"
 	relaycommon "github.com/QuantumNous/new-api/relay/common"
 	"github.com/QuantumNous/new-api/setting/ratio_setting"
@@ -58,6 +59,8 @@ func LogTaskConsumption(c *gin.Context, info *relaycommon.RelayInfo) {
 	})
 	model.UpdateUserUsedQuotaAndRequestCount(info.UserId, info.PriceData.Quota)
 	model.UpdateChannelUsedQuota(info.ChannelId, info.PriceData.Quota)
+
+	metrics.RecordAIMetrics(info, nil)
 }
 
 // ---------------------------------------------------------------------------

From 426cc06b837d550f1d3e2b8641f4a168da3a6775 Mon Sep 17 00:00:00 2001
From: RichardoMrMu <tianbowen.tbw@antgroup.com>
Date: Wed, 25 Mar 2026 16:48:50 +0800
Subject: [PATCH 3/3] fix: add missing metrics package files (ai_metrics.go,
 metrics.go)

These files were present locally but never committed to the branch,
causing compilation failures. This commit adds:

- metrics/ai_metrics.go: LLM metrics definitions and RecordAIMetrics()
- middleware/metrics.go: HTTP Prometheus middleware and MetricsHandler()

Both files are required by the integration points in main.go,
router/main.go, service/quota.go, service/text_quota.go, and
service/task_billing.go.
---
 metrics/ai_metrics.go | 219 ++++++++++++++++++++++++++++++++++++++++++
 middleware/metrics.go | 115 ++++++++++++++++++++++
 2 files changed, 334 insertions(+)
 create mode 100644 metrics/ai_metrics.go
 create mode 100644 middleware/metrics.go

diff --git a/metrics/ai_metrics.go b/metrics/ai_metrics.go
new file mode 100644
index 0000000000..5e676e2418
--- /dev/null
+++ b/metrics/ai_metrics.go
@@ -0,0 +1,219 @@
+package metrics
+
+import (
+	"fmt"
+	"os"
+	"strconv"
+	"time"
+
+	"github.com/QuantumNous/new-api/dto"
+	relaycommon "github.com/QuantumNous/new-api/relay/common"
+
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+// region is read once from MAAS_REGION env var at startup.
+var region string
+
+func init() {
+	region = os.Getenv("MAAS_REGION")
+	if region == "" {
+		region = "unknown"
+	}
+
+	prometheus.MustRegister(
+		llmInputTokenTotal,
+		llmOutputTokenTotal,
+		llmRequestTotal,
+		llmServiceDuration,
+		llmFirstTokenDuration,
+		llmTimePerOutputToken,
+		rateLimitTotal,
+		circuitBreakerState,
+		llmGatewayDuration,
+	)
+}
+
+// GetRegion returns the configured MAAS_REGION value.
+func GetRegion() string {
+	return region
+}
+
+// ---- LLM Metrics (6) ----
+
+var llmRequestLabelNames = []string{
+	"model", "channel", "upstream_model", "status", "error_type",
+	"region", "is_stream", "token_name",
+}
+
+var llmTokenLabelNames = []string{
+	"model", "channel", "upstream_model", "region", "token_name",
+}
+
+var llmLatencyLabelNames = []string{
+	"model", "channel", "region",
+}
+
+var (
+	llmRequestTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: "newapi",
+			Name:      "llm_request_total",
+			Help:      "Total number of LLM requests",
+		},
+		llmRequestLabelNames,
+	)
+
+	llmInputTokenTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: "newapi",
+			Name:      "llm_input_token_total",
+			Help:      "Total number of LLM input (prompt) tokens",
+		},
+		llmTokenLabelNames,
+	)
+
+	llmOutputTokenTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: "newapi",
+			Name:      "llm_output_token_total",
+			Help:      "Total number of LLM output (completion) tokens",
+		},
+		llmTokenLabelNames,
+	)
+
+	llmFirstTokenDuration = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Namespace: "newapi",
+			Name:      "llm_first_token_duration_seconds",
+			Help:      "LLM time-to-first-token (TTFT) in seconds",
+			Buckets:   []float64{0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30},
+		},
+		llmLatencyLabelNames,
+	)
+
+	llmTimePerOutputToken = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Namespace: "newapi",
+			Name:      "llm_time_per_output_token_seconds",
+			Help:      "LLM time per output token (TPOT) in seconds",
+			Buckets:   []float64{0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1},
+		},
+		llmLatencyLabelNames,
+	)
+
+	llmServiceDuration = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Namespace: "newapi",
+			Name:      "llm_service_duration_seconds",
+			Help:      "LLM upstream service duration in seconds (from request start to response complete)",
+			Buckets:   []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60, 120, 300},
+		},
+		llmLatencyLabelNames,
+	)
+)
+
+// ---- Rate Limit / Circuit Breaker / Gateway Metrics (3) ----
+
+var (
+	rateLimitTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: "newapi",
+			Name:      "rate_limit_total",
+			Help:      "Total number of rate limit triggers",
+		},
+		[]string{"model", "channel", "type", "token_name"},
+	)
+
+	circuitBreakerState = prometheus.NewGaugeVec(
+		prometheus.GaugeOpts{
+			Namespace: "newapi",
+			Name:      "circuit_breaker_state",
+			Help:      "Circuit breaker state (0=Closed, 1=HalfOpen, 2=Open)",
+		},
+		[]string{"channel", "model"},
+	)
+
+	llmGatewayDuration = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Namespace: "newapi",
+			Name:      "llm_gateway_duration_seconds",
+			Help:      "Gateway processing duration in seconds (excluding upstream)",
+			Buckets:   []float64{0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5},
+		},
+		[]string{"model", "channel"},
+	)
+)
+
+// RecordAIMetrics should be called after a relay request completes.
+// It records token counts, request count, service duration, TTFT, and TPOT.
+func RecordAIMetrics(relayInfo *relaycommon.RelayInfo, usage *dto.Usage) {
+	RecordAIMetricsWithStatus(relayInfo, usage, "success", "")
+}
+
+// RecordAIMetricsWithStatus records LLM metrics with explicit status and error type.
+func RecordAIMetricsWithStatus(relayInfo *relaycommon.RelayInfo, usage *dto.Usage, status string, errorType string) {
+	if relayInfo == nil || relayInfo.ChannelMeta == nil {
+		return
+	}
+
+	model := relayInfo.OriginModelName
+	channel := fmt.Sprintf("%d", relayInfo.ChannelMeta.ChannelId)
+	upstreamModel := relayInfo.ChannelMeta.UpstreamModelName
+	tokenName := ""
+	if relayInfo.TokenKey != "" {
+		tokenName = relayInfo.TokenKey
+	}
+	isStream := strconv.FormatBool(relayInfo.IsStream)
+
+	// Request count (with status labels)
+	llmRequestTotal.WithLabelValues(
+		model, channel, upstreamModel, status, errorType,
+		region, isStream, tokenName,
+	).Inc()
+
+	// Token counts
+	if usage != nil {
+		tokenLabels := []string{model, channel, upstreamModel, region, tokenName}
+		llmInputTokenTotal.WithLabelValues(tokenLabels...).Add(float64(usage.PromptTokens))
+		llmOutputTokenTotal.WithLabelValues(tokenLabels...).Add(float64(usage.CompletionTokens))
+	}
+
+	latencyLabels := []string{model, channel, region}
+
+	// Service duration (total time from request start to now)
+	serviceDuration := time.Since(relayInfo.StartTime).Seconds()
+	llmServiceDuration.WithLabelValues(latencyLabels...).Observe(serviceDuration)
+
+	// Time-to-first-token (only meaningful when FirstResponseTime was recorded)
+	if !relayInfo.FirstResponseTime.IsZero() {
+		ttft := relayInfo.FirstResponseTime.Sub(relayInfo.StartTime).Seconds()
+		if ttft > 0 {
+			llmFirstTokenDuration.WithLabelValues(latencyLabels...).Observe(ttft)
+		}
+
+		// Time per output token (TPOT): (total_duration - ttft) / output_tokens
+		if usage != nil && usage.CompletionTokens > 0 {
+			generationDuration := serviceDuration - ttft
+			if generationDuration > 0 {
+				tpot := generationDuration / float64(usage.CompletionTokens)
+				llmTimePerOutputToken.WithLabelValues(latencyLabels...).Observe(tpot)
+			}
+		}
+	}
+}
+
+// RecordRateLimit records a rate limit trigger event.
+func RecordRateLimit(model string, channel string, limitType string, tokenName string) {
+	rateLimitTotal.WithLabelValues(model, channel, limitType, tokenName).Inc()
+}
+
+// RecordCircuitBreakerState updates the circuit breaker state gauge.
+func RecordCircuitBreakerState(channel string, model string, state float64) {
+	circuitBreakerState.WithLabelValues(channel, model).Set(state)
+}
+
+// RecordGatewayDuration records the gateway processing duration (excluding upstream).
+func RecordGatewayDuration(model string, channel string, durationSeconds float64) {
+	llmGatewayDuration.WithLabelValues(model, channel).Observe(durationSeconds)
+}
diff --git a/middleware/metrics.go b/middleware/metrics.go
new file mode 100644
index 0000000000..3bf93b9800
--- /dev/null
+++ b/middleware/metrics.go
@@ -0,0 +1,115 @@
+package middleware
+
+import (
+	"strconv"
+	"time"
+
+	"github.com/QuantumNous/new-api/metrics"
+
+	"github.com/gin-gonic/gin"
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promhttp"
+)
+
+var httpLabelNames = []string{"method", "path", "status", "region"}
+
+var (
+	httpRequestsTotal = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Namespace: "newapi",
+			Name:      "http_requests_total",
+			Help:      "Total number of HTTP requests",
+		},
+		httpLabelNames,
+	)
+
+	httpRequestDuration = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Namespace: "newapi",
+			Name:      "http_request_duration_seconds",
+			Help:      "HTTP request duration in seconds",
+			Buckets:   []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60},
+		},
+		httpLabelNames,
+	)
+
+	httpRequestsInFlight = prometheus.NewGauge(
+		prometheus.GaugeOpts{
+			Namespace: "newapi",
+			Name:      "http_requests_in_flight",
+			Help:      "Number of HTTP requests currently being processed",
+		},
+	)
+
+	httpResponseSizeBytes = prometheus.NewHistogramVec(
+		prometheus.HistogramOpts{
+			Namespace: "newapi",
+			Name:      "http_response_size_bytes",
+			Help:      "HTTP response size in bytes",
+			Buckets:   prometheus.ExponentialBuckets(100, 10, 8),
+		},
+		httpLabelNames,
+	)
+)
+
+func init() {
+	prometheus.MustRegister(
+		httpRequestsTotal,
+		httpRequestDuration,
+		httpRequestsInFlight,
+		httpResponseSizeBytes,
+	)
+}
+
+// PrometheusMiddleware collects HTTP golden metrics for each request.
+// It records request count, latency, in-flight requests, and response size.
+func PrometheusMiddleware() gin.HandlerFunc {
+	return func(c *gin.Context) {
+		if c.Request.URL.Path == "/metrics" {
+			c.Next()
+			return
+		}
+
+		startTime := time.Now()
+		httpRequestsInFlight.Inc()
+
+		c.Next()
+
+		httpRequestsInFlight.Dec()
+
+		statusCode := strconv.Itoa(c.Writer.Status())
+		routePath := normalizeRoutePath(c)
+		method := c.Request.Method
+		duration := time.Since(startTime).Seconds()
+		responseSize := float64(c.Writer.Size())
+		regionLabel := metrics.GetRegion()
+
+		httpRequestsTotal.WithLabelValues(method, routePath, statusCode, regionLabel).Inc()
+		httpRequestDuration.WithLabelValues(method, routePath, statusCode, regionLabel).Observe(duration)
+		httpResponseSizeBytes.WithLabelValues(method, routePath, statusCode, regionLabel).Observe(responseSize)
+	}
+}
+
+// MetricsHandler returns the Prometheus metrics HTTP handler for the /metrics endpoint.
+func MetricsHandler() gin.HandlerFunc {
+	handler := promhttp.Handler()
+	return func(c *gin.Context) {
+		handler.ServeHTTP(c.Writer, c.Request)
+	}
+}
+
+// normalizeRoutePath extracts the matched route template to avoid high-cardinality labels.
+// Falls back to a generic label if no route template is available.
+func normalizeRoutePath(c *gin.Context) string {
+	routePath := c.FullPath()
+	if routePath != "" {
+		return routePath
+	}
+
+	routeTag, exists := c.Get(RouteTagKey)
+	if exists {
+		return routeTag.(string)
+	}
+
+	return "unmatched"
+}