Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ func main() {
}))
// This will cause SSE not to work!!!
//server.Use(gzip.Gzip(gzip.DefaultCompression))
server.Use(middleware.PrometheusMiddleware())
server.Use(middleware.RequestId())
server.Use(middleware.PoweredBy())
server.Use(middleware.I18n())
Expand Down
219 changes: 219 additions & 0 deletions metrics/ai_metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
package metrics

import (
	"crypto/sha256"
	"fmt"
	"os"
	"strconv"
	"time"

	"github.com/QuantumNous/new-api/dto"
	relaycommon "github.com/QuantumNous/new-api/relay/common"

	"github.com/prometheus/client_golang/prometheus"
)

// region caches the MAAS_REGION environment variable, resolved once at
// process start and attached as a label to every metric.
var region string

func init() {
	region = os.Getenv("MAAS_REGION")
	if len(region) == 0 {
		region = "unknown"
	}

	// Register every collector up front; MustRegister panics on duplicate
	// registration, which surfaces wiring mistakes at startup rather than
	// silently dropping metrics later.
	for _, collector := range []prometheus.Collector{
		llmInputTokenTotal,
		llmOutputTokenTotal,
		llmRequestTotal,
		llmServiceDuration,
		llmFirstTokenDuration,
		llmTimePerOutputToken,
		rateLimitTotal,
		circuitBreakerState,
		llmGatewayDuration,
	} {
		prometheus.MustRegister(collector)
	}
}

// GetRegion returns the configured MAAS_REGION value.
func GetRegion() string {
	return region
}

// ---- LLM Metrics (6) ----

// llmRequestLabelNames are the labels attached to llm_request_total.
// NOTE(review): "token_name" must carry a low-cardinality, non-secret
// identifier — if it is ever populated from a raw token key, /metrics would
// leak credentials and label cardinality becomes unbounded. Verify the value
// assigned at the call site.
var llmRequestLabelNames = []string{
	"model", "channel", "upstream_model", "status", "error_type",
	"region", "is_stream", "token_name",
}

// llmTokenLabelNames are the labels shared by the input/output token counters.
// Label order here must match the argument order of every WithLabelValues call.
var llmTokenLabelNames = []string{
	"model", "channel", "upstream_model", "region", "token_name",
}
Comment on lines +45 to +51
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

Remove raw token keys from metric labels (secret leakage + cardinality blow-up).

Line 164-Line 166 maps relayInfo.TokenKey into token_name, and that label is used in counters on Line 170-Line 179. This can expose credentials in /metrics and create unbounded label cardinality.

Suggested fix
 var llmRequestLabelNames = []string{
 	"model", "channel", "upstream_model", "status", "error_type",
-	"region", "is_stream", "token_name",
+	"region", "is_stream",
 }

 var llmTokenLabelNames = []string{
-	"model", "channel", "upstream_model", "region", "token_name",
+	"model", "channel", "upstream_model", "region",
 }
@@
-	tokenName := ""
-	if relayInfo.TokenKey != "" {
-		tokenName = relayInfo.TokenKey
-	}
 	isStream := strconv.FormatBool(relayInfo.IsStream)
@@
 	llmRequestTotal.WithLabelValues(
 		model, channel, upstreamModel, status, errorType,
-		region, isStream, tokenName,
+		region, isStream,
 	).Inc()
@@
-		tokenLabels := []string{model, channel, upstreamModel, region, tokenName}
+		tokenLabels := []string{model, channel, upstreamModel, region}
 		llmInputTokenTotal.WithLabelValues(tokenLabels...).Add(float64(usage.PromptTokens))
 		llmOutputTokenTotal.WithLabelValues(tokenLabels...).Add(float64(usage.CompletionTokens))
 	}

Also applies to: 163-166, 170-179

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@metrics/ai_metrics.go` around lines 45 - 51, The code is exposing raw token
keys via the "token_name" metric label (relayInfo.TokenKey) causing secret
leakage and unbounded cardinality; remove "token_name" from llmTokenLabelNames
and any counters/gauges that use it, and replace the mapped value with a safe,
low-cardinality alternative (e.g., a token_type, provider identifier, a boolean
like "has_token", or a fixed hash/salt-and-hash function) wherever
relayInfo.TokenKey is currently assigned; update the metric registration
(variables referencing llmTokenLabelNames and any counters using that label) and
the code path that sets token label (referencing relayInfo.TokenKey and
"token_name") to emit the new safe label instead.


// llmLatencyLabelNames label every latency histogram; the set is kept
// deliberately small to bound time-series cardinality.
var llmLatencyLabelNames = []string{"model", "channel", "region"}

// llmRequestTotal counts relayed LLM requests, including failed ones.
var llmRequestTotal = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Namespace: "newapi",
		Name:      "llm_request_total",
		Help:      "Total number of LLM requests",
	},
	llmRequestLabelNames,
)

// llmInputTokenTotal accumulates prompt tokens sent upstream.
var llmInputTokenTotal = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Namespace: "newapi",
		Name:      "llm_input_token_total",
		Help:      "Total number of LLM input (prompt) tokens",
	},
	llmTokenLabelNames,
)

// llmOutputTokenTotal accumulates completion tokens returned upstream.
var llmOutputTokenTotal = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Namespace: "newapi",
		Name:      "llm_output_token_total",
		Help:      "Total number of LLM output (completion) tokens",
	},
	llmTokenLabelNames,
)

// llmFirstTokenDuration observes time-to-first-token; it is only recorded
// when a first-response timestamp was captured for the request.
var llmFirstTokenDuration = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Namespace: "newapi",
		Name:      "llm_first_token_duration_seconds",
		Help:      "LLM time-to-first-token (TTFT) in seconds",
		Buckets:   []float64{0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30},
	},
	llmLatencyLabelNames,
)

// llmTimePerOutputToken observes average generation time per output token.
var llmTimePerOutputToken = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Namespace: "newapi",
		Name:      "llm_time_per_output_token_seconds",
		Help:      "LLM time per output token (TPOT) in seconds",
		Buckets:   []float64{0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1},
	},
	llmLatencyLabelNames,
)

// llmServiceDuration observes total upstream service time per request.
var llmServiceDuration = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Namespace: "newapi",
		Name:      "llm_service_duration_seconds",
		Help:      "LLM upstream service duration in seconds (from request start to response complete)",
		Buckets:   []float64{0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60, 120, 300},
	},
	llmLatencyLabelNames,
)

// ---- Rate Limit / Circuit Breaker / Gateway Metrics (3) ----

// rateLimitTotal counts rate-limit trigger events, broken down by limit type.
var rateLimitTotal = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Namespace: "newapi",
		Name:      "rate_limit_total",
		Help:      "Total number of rate limit triggers",
	},
	[]string{"model", "channel", "type", "token_name"},
)

// circuitBreakerState exposes the current breaker state per channel/model pair.
var circuitBreakerState = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Namespace: "newapi",
		Name:      "circuit_breaker_state",
		Help:      "Circuit breaker state (0=Closed, 1=HalfOpen, 2=Open)",
	},
	[]string{"channel", "model"},
)

// llmGatewayDuration observes time spent inside the gateway itself,
// excluding the upstream call.
var llmGatewayDuration = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Namespace: "newapi",
		Name:      "llm_gateway_duration_seconds",
		Help:      "Gateway processing duration in seconds (excluding upstream)",
		Buckets:   []float64{0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5},
	},
	[]string{"model", "channel"},
)

// RecordAIMetrics records metrics for a completed relay request on the
// success path: token counts, request count, service duration, TTFT and TPOT.
// It is shorthand for RecordAIMetricsWithStatus with a "success" status.
func RecordAIMetrics(relayInfo *relaycommon.RelayInfo, usage *dto.Usage) {
	const (
		successStatus = "success"
		noErrorType   = ""
	)
	RecordAIMetricsWithStatus(relayInfo, usage, successStatus, noErrorType)
}

// RecordAIMetricsWithStatus records LLM metrics with explicit status and error type.
func RecordAIMetricsWithStatus(relayInfo *relaycommon.RelayInfo, usage *dto.Usage, status string, errorType string) {
if relayInfo == nil || relayInfo.ChannelMeta == nil {
return
}

model := relayInfo.OriginModelName
channel := fmt.Sprintf("%d", relayInfo.ChannelMeta.ChannelId)
upstreamModel := relayInfo.ChannelMeta.UpstreamModelName
tokenName := ""
if relayInfo.TokenKey != "" {
tokenName = relayInfo.TokenKey
}
isStream := strconv.FormatBool(relayInfo.IsStream)

// Request count (with status labels)
llmRequestTotal.WithLabelValues(
model, channel, upstreamModel, status, errorType,
region, isStream, tokenName,
).Inc()

// Token counts
if usage != nil {
tokenLabels := []string{model, channel, upstreamModel, region, tokenName}
llmInputTokenTotal.WithLabelValues(tokenLabels...).Add(float64(usage.PromptTokens))
llmOutputTokenTotal.WithLabelValues(tokenLabels...).Add(float64(usage.CompletionTokens))
}

latencyLabels := []string{model, channel, region}

// Service duration (total time from request start to now)
serviceDuration := time.Since(relayInfo.StartTime).Seconds()
llmServiceDuration.WithLabelValues(latencyLabels...).Observe(serviceDuration)

// Time-to-first-token (only meaningful when FirstResponseTime was recorded)
if !relayInfo.FirstResponseTime.IsZero() {
ttft := relayInfo.FirstResponseTime.Sub(relayInfo.StartTime).Seconds()
if ttft > 0 {
llmFirstTokenDuration.WithLabelValues(latencyLabels...).Observe(ttft)
}
Comment on lines +185 to +193
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Guard zero StartTime before duration calculations.

Line 185 computes time.Since(relayInfo.StartTime) unconditionally. If StartTime is zero, duration/TTFT values become invalid and heavily skew histograms.

Suggested fix
 	latencyLabels := []string{model, channel, region}

+	if relayInfo.StartTime.IsZero() {
+		return
+	}
+
 	// Service duration (total time from request start to now)
 	serviceDuration := time.Since(relayInfo.StartTime).Seconds()
 	llmServiceDuration.WithLabelValues(latencyLabels...).Observe(serviceDuration)

 	// Time-to-first-token (only meaningful when FirstResponseTime was recorded)
-	if !relayInfo.FirstResponseTime.IsZero() {
+	if !relayInfo.FirstResponseTime.IsZero() && relayInfo.FirstResponseTime.After(relayInfo.StartTime) {
 		ttft := relayInfo.FirstResponseTime.Sub(relayInfo.StartTime).Seconds()
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@metrics/ai_metrics.go` around lines 185 - 193, The code computes durations
from relayInfo.StartTime without checking it; guard against a zero StartTime
before computing serviceDuration and ttft by verifying
relayInfo.StartTime.IsZero() and skipping observation when it is zero.
Specifically, in the block that calculates serviceDuration (variable
serviceDuration and metric llmServiceDuration) and the TTFT logic that uses
relayInfo.FirstResponseTime and llmFirstTokenDuration, add a check for
relayInfo.StartTime.IsZero() and only call time.Since(...) /
FirstResponseTime.Sub(...) and observe metrics when StartTime is non-zero (and
ttft > 0 as already done).


// Time per output token (TPOT): (total_duration - ttft) / output_tokens
if usage != nil && usage.CompletionTokens > 0 {
generationDuration := serviceDuration - ttft
if generationDuration > 0 {
tpot := generationDuration / float64(usage.CompletionTokens)
llmTimePerOutputToken.WithLabelValues(latencyLabels...).Observe(tpot)
}
}
}
}

// RecordRateLimit increments the rate-limit counter for a single trigger
// event, labelled by model, channel, limit type and token name.
func RecordRateLimit(model string, channel string, limitType string, tokenName string) {
	labels := []string{model, channel, limitType, tokenName}
	rateLimitTotal.WithLabelValues(labels...).Inc()
}

// RecordCircuitBreakerState sets the breaker gauge for a channel/model pair.
// Expected values follow the gauge's help text: 0=Closed, 1=HalfOpen, 2=Open.
func RecordCircuitBreakerState(channel string, model string, state float64) {
	gauge := circuitBreakerState.WithLabelValues(channel, model)
	gauge.Set(state)
}

// RecordGatewayDuration observes how long the gateway itself spent on a
// request (upstream time excluded), in seconds.
func RecordGatewayDuration(model string, channel string, durationSeconds float64) {
	hist := llmGatewayDuration.WithLabelValues(model, channel)
	hist.Observe(durationSeconds)
}
115 changes: 115 additions & 0 deletions middleware/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
package middleware

import (
"strconv"
"time"

"github.com/QuantumNous/new-api/metrics"

"github.com/gin-gonic/gin"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
)

// httpLabelNames are the labels shared by all HTTP golden-signal metrics.
// "path" is the matched route template (see normalizeRoutePath), which keeps
// cardinality bounded.
var httpLabelNames = []string{"method", "path", "status", "region"}

var (
	// httpRequestsTotal counts completed HTTP requests.
	httpRequestsTotal = prometheus.NewCounterVec(
		prometheus.CounterOpts{
			Namespace: "newapi",
			Name:      "http_requests_total",
			Help:      "Total number of HTTP requests",
		},
		httpLabelNames,
	)

	// httpRequestDuration observes wall-clock handler latency per request.
	httpRequestDuration = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: "newapi",
			Name:      "http_request_duration_seconds",
			Help:      "HTTP request duration in seconds",
			Buckets:   []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60},
		},
		httpLabelNames,
	)

	// httpRequestsInFlight tracks requests currently inside the handler chain.
	httpRequestsInFlight = prometheus.NewGauge(
		prometheus.GaugeOpts{
			Namespace: "newapi",
			Name:      "http_requests_in_flight",
			Help:      "Number of HTTP requests currently being processed",
		},
	)

	// httpResponseSizeBytes observes response body sizes; exponential buckets
	// span 100 B up to ~10 GB (100 * 10^7).
	httpResponseSizeBytes = prometheus.NewHistogramVec(
		prometheus.HistogramOpts{
			Namespace: "newapi",
			Name:      "http_response_size_bytes",
			Help:      "HTTP response size in bytes",
			Buckets:   prometheus.ExponentialBuckets(100, 10, 8),
		},
		httpLabelNames,
	)
)

func init() {
	// Register all HTTP collectors at startup; MustRegister panics on a
	// duplicate, turning wiring mistakes into immediate failures.
	for _, collector := range []prometheus.Collector{
		httpRequestsTotal,
		httpRequestDuration,
		httpRequestsInFlight,
		httpResponseSizeBytes,
	} {
		prometheus.MustRegister(collector)
	}
}

// PrometheusMiddleware collects HTTP golden metrics for each request:
// request count, latency, in-flight requests, and response size.
// The /metrics endpoint itself is excluded to avoid self-measurement noise.
func PrometheusMiddleware() gin.HandlerFunc {
	return func(c *gin.Context) {
		if c.Request.URL.Path == "/metrics" {
			c.Next()
			return
		}

		startTime := time.Now()
		httpRequestsInFlight.Inc()
		// Decrement via defer so the gauge cannot drift upward permanently
		// when a downstream handler panics (e.g. recovered by gin's
		// Recovery middleware further up the chain).
		defer httpRequestsInFlight.Dec()

		c.Next()

		statusCode := strconv.Itoa(c.Writer.Status())
		routePath := normalizeRoutePath(c)
		method := c.Request.Method
		duration := time.Since(startTime).Seconds()
		regionLabel := metrics.GetRegion()

		httpRequestsTotal.WithLabelValues(method, routePath, statusCode, regionLabel).Inc()
		httpRequestDuration.WithLabelValues(method, routePath, statusCode, regionLabel).Observe(duration)

		// gin's ResponseWriter.Size() returns -1 when no body was written;
		// observing a negative value would corrupt the size histogram.
		if size := c.Writer.Size(); size >= 0 {
			httpResponseSizeBytes.WithLabelValues(method, routePath, statusCode, regionLabel).Observe(float64(size))
		}
	}
}

// MetricsHandler adapts the standard Prometheus HTTP handler to a gin
// handler for mounting on the /metrics route.
func MetricsHandler() gin.HandlerFunc {
	return gin.WrapH(promhttp.Handler())
}

// normalizeRoutePath returns the matched route template (e.g. "/v1/foo/:id")
// so path labels stay low-cardinality. It falls back to the route tag stored
// in the context, and finally to "unmatched" when neither is available.
func normalizeRoutePath(c *gin.Context) string {
	if routePath := c.FullPath(); routePath != "" {
		return routePath
	}

	// Use a checked type assertion: an unchecked routeTag.(string) would
	// panic if some middleware stored a non-string under RouteTagKey.
	if routeTag, exists := c.Get(RouteTagKey); exists {
		if tag, ok := routeTag.(string); ok {
			return tag
		}
	}

	return "unmatched"
}
1 change: 1 addition & 0 deletions router/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
)

func SetRouter(router *gin.Engine, buildFS embed.FS, indexPage []byte) {
router.GET("/metrics", middleware.MetricsHandler())
SetApiRouter(router)
SetDashboardRouter(router)
SetRelayRouter(router)
Expand Down
7 changes: 7 additions & 0 deletions service/quota.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"github.com/QuantumNous/new-api/constant"
"github.com/QuantumNous/new-api/dto"
"github.com/QuantumNous/new-api/logger"
"github.com/QuantumNous/new-api/metrics"
"github.com/QuantumNous/new-api/model"
relaycommon "github.com/QuantumNous/new-api/relay/common"
"github.com/QuantumNous/new-api/setting/ratio_setting"
Expand Down Expand Up @@ -233,6 +234,12 @@ func PostWssConsumeQuota(ctx *gin.Context, relayInfo *relaycommon.RelayInfo, mod
Group: relayInfo.UsingGroup,
Other: other,
})

metrics.RecordAIMetrics(relayInfo, &dto.Usage{
PromptTokens: usage.InputTokens,
CompletionTokens: usage.OutputTokens,
TotalTokens: usage.TotalTokens,
})
}

func CalcOpenRouterCacheCreateTokens(usage dto.Usage, priceData types.PriceData) int {
Expand Down
3 changes: 3 additions & 0 deletions service/task_billing.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"github.com/QuantumNous/new-api/common"
"github.com/QuantumNous/new-api/constant"
"github.com/QuantumNous/new-api/logger"
"github.com/QuantumNous/new-api/metrics"
"github.com/QuantumNous/new-api/model"
relaycommon "github.com/QuantumNous/new-api/relay/common"
"github.com/QuantumNous/new-api/setting/ratio_setting"
Expand Down Expand Up @@ -58,6 +59,8 @@ func LogTaskConsumption(c *gin.Context, info *relaycommon.RelayInfo) {
})
model.UpdateUserUsedQuotaAndRequestCount(info.UserId, info.PriceData.Quota)
model.UpdateChannelUsedQuota(info.ChannelId, info.PriceData.Quota)

metrics.RecordAIMetrics(info, nil)
}

// ---------------------------------------------------------------------------
Expand Down
Loading