Skip to content

Commit 530b2c0

Browse files
feat(metrics): emit hosted-key metrics to Grafana via OTel (#4885)
* feat(metrics): emit hosted-key metrics to Grafana via OTel Replace the dropped platform.hosted_key.* spans with OTel counters/histograms for usage, cost, failures, throttles, and queue waits. Wire a MeterProvider into the Next.js OTel SDK (trigger.dev already exports metrics). Per-key attribution via a key label (env var name). * fix(metrics): correct hosted-key failure attribution - Re-point used/cost/failed labels at the freshly acquired key after reacquire - Classify quota-style 401/403 as rate_limited (mirror isRateLimitError) - Count returned success:false runs (e.g. deep_research polling) as failed * fix(metrics): label hosted_key.throttled with real provider on exhausted retries * fix(metrics): parse OTLP metrics URL via URL/pathname, not string suffix Handles query strings and trailing slashes so the /v1/traces->/v1/metrics swap can't produce a malformed endpoint, matching normalizeOtlpTracesUrl.
1 parent 80ea0dd commit 530b2c0

6 files changed

Lines changed: 260 additions & 49 deletions

File tree

apps/sim/instrumentation-node.ts

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,25 @@ function normalizeOtlpTracesUrl(url: string): string {
6363
}
6464
}
6565

66+
// Metrics counterpart to `normalizeOtlpTracesUrl`. Operates on the parsed
67+
// pathname (not a raw string suffix) so query strings and trailing slashes
68+
// don't corrupt the result: swap a `/v1/traces` suffix for `/v1/metrics`,
69+
// otherwise append `/v1/metrics`.
70+
function normalizeOtlpMetricsUrl(url: string): string {
71+
if (!url) return url
72+
try {
73+
const u = new URL(url)
74+
const path = u.pathname.replace(/\/$/, '')
75+
if (path.endsWith('/v1/metrics')) return url
76+
u.pathname = path.endsWith('/v1/traces')
77+
? path.replace(/\/v1\/traces$/, '/v1/metrics')
78+
: `${path}/v1/metrics`
79+
return u.toString()
80+
} catch {
81+
return url
82+
}
83+
}
84+
6685
// Sampling ratio from env (mirrors Go's `samplerFromEnv`); fallback
6786
// is 100% everywhere. Retention caps cost, not sampling.
6887
function resolveSamplingRatio(_isLocalEndpoint: boolean): number {
@@ -144,6 +163,8 @@ async function initializeOpenTelemetry() {
144163
'@opentelemetry/semantic-conventions/incubating'
145164
)
146165
const { OTLPTraceExporter } = await import('@opentelemetry/exporter-trace-otlp-http')
166+
const { OTLPMetricExporter } = await import('@opentelemetry/exporter-metrics-otlp-http')
167+
const { PeriodicExportingMetricReader } = await import('@opentelemetry/sdk-metrics')
147168
const { BatchSpanProcessor } = await import('@opentelemetry/sdk-trace-node')
148169
const { TraceIdRatioBasedSampler, SamplingDecision } = await import(
149170
'@opentelemetry/sdk-trace-base'
@@ -226,6 +247,18 @@ async function initializeOpenTelemetry() {
226247
exportTimeoutMillis: telemetryConfig.batchSettings.exportTimeoutMillis,
227248
})
228249

250+
// Metrics (hosted-key counters/histograms) share the trace endpoint and
251+
// headers — only the signal path differs. Unlike spans these aren't sampled.
252+
const metricReader = new PeriodicExportingMetricReader({
253+
exporter: new OTLPMetricExporter({
254+
url: normalizeOtlpMetricsUrl(telemetryConfig.endpoint),
255+
headers: otlpHeaders,
256+
timeoutMillis: Math.min(telemetryConfig.batchSettings.exportTimeoutMillis, 10000),
257+
keepAlive: false,
258+
}),
259+
exportIntervalMillis: 60000,
260+
})
261+
229262
// Unique instance id per origin keeps Jaeger's clock-skew adjuster
230263
// from grouping Sim+Go spans together (they'd see multi-second
231264
// drift as intra-service and emit spurious warnings).
@@ -268,6 +301,7 @@ async function initializeOpenTelemetry() {
268301
resource,
269302
spanProcessors,
270303
sampler,
304+
metricReader,
271305
})
272306

273307
sdk.start()

apps/sim/lib/core/telemetry.ts

Lines changed: 13 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import { createLogger } from '@sim/logger'
2121
import { getErrorMessage, toError } from '@sim/utils/errors'
2222
import { TraceAttr } from '@/lib/copilot/generated/trace-attributes-v1'
2323
import type { TraceSpan } from '@/lib/logs/types'
24+
import { hostedKeyMetrics } from '@/lib/monitoring/metrics'
2425

2526
/**
2627
* GenAI Semantic Convention Attributes
@@ -954,14 +955,10 @@ export const PlatformEvents = {
954955
workspaceId?: string
955956
workflowId?: string
956957
}) => {
957-
trackPlatformEvent('platform.hosted_key.user_throttled', {
958-
'tool.id': attrs.toolId,
959-
'throttle.reason': attrs.reason,
960-
...(attrs.provider && { 'provider.id': attrs.provider }),
961-
...(attrs.retryAfterMs != null && { 'rate_limit.retry_after_ms': attrs.retryAfterMs }),
962-
...(attrs.userId && { 'user.id': attrs.userId }),
963-
...(attrs.workspaceId && { 'workspace.id': attrs.workspaceId }),
964-
...(attrs.workflowId && { 'workflow.id': attrs.workflowId }),
958+
hostedKeyMetrics.recordThrottled({
959+
provider: attrs.provider ?? attrs.toolId,
960+
tool: attrs.toolId,
961+
reason: attrs.reason,
965962
})
966963
},
967964

@@ -978,28 +975,15 @@ export const PlatformEvents = {
978975
workspaceId?: string
979976
workflowId?: string
980977
}) => {
981-
trackPlatformEvent('platform.hosted_key.rate_limited', {
982-
'tool.id': attrs.toolId,
983-
'hosted_key.env_var': attrs.envVarName,
984-
'rate_limit.attempt': attrs.attempt,
985-
'rate_limit.max_retries': attrs.maxRetries,
986-
'rate_limit.delay_ms': attrs.delayMs,
987-
...(attrs.userId && { 'user.id': attrs.userId }),
988-
...(attrs.workspaceId && { 'workspace.id': attrs.workspaceId }),
989-
...(attrs.workflowId && { 'workflow.id': attrs.workflowId }),
990-
})
978+
hostedKeyMetrics.recordUpstreamRateLimited({ tool: attrs.toolId, key: attrs.envVarName })
991979
},
992980

993981
hostedKeyUnknownModelCost: (attrs: {
994982
toolId: string
995983
modelName: string
996984
defaultCost: number
997985
}) => {
998-
trackPlatformEvent('platform.hosted_key.unknown_model_cost', {
999-
'tool.id': attrs.toolId,
1000-
'model.name': attrs.modelName,
1001-
'cost.default_cost': attrs.defaultCost,
1002-
})
986+
hostedKeyMetrics.recordUnknownModelCost({ tool: attrs.toolId })
1003987
},
1004988

1005989
/**
@@ -1016,14 +1000,9 @@ export const PlatformEvents = {
10161000
dimension?: string
10171001
queuePosition?: number
10181002
}) => {
1019-
trackPlatformEvent('platform.hosted_key.queue_waited', {
1020-
'provider.id': attrs.provider,
1021-
'workspace.id': attrs.workspaceId,
1022-
'queue.waited_ms': attrs.waitedMs,
1023-
'queue.attempts': attrs.attempts,
1024-
'queue.reason': attrs.reason,
1025-
...(attrs.dimension && { 'queue.dimension': attrs.dimension }),
1026-
...(attrs.queuePosition != null && { 'queue.position': attrs.queuePosition }),
1003+
hostedKeyMetrics.recordQueueWait(attrs.waitedMs, {
1004+
provider: attrs.provider,
1005+
reason: attrs.reason,
10271006
})
10281007
},
10291008

@@ -1037,12 +1016,9 @@ export const PlatformEvents = {
10371016
reason: 'actor_requests' | 'dimension' | 'queue_position'
10381017
dimension?: string
10391018
}) => {
1040-
trackPlatformEvent('platform.hosted_key.queue_wait_exceeded', {
1041-
'provider.id': attrs.provider,
1042-
'workspace.id': attrs.workspaceId,
1043-
'queue.waited_ms': attrs.waitedMs,
1044-
'queue.reason': attrs.reason,
1045-
...(attrs.dimension && { 'queue.dimension': attrs.dimension }),
1019+
hostedKeyMetrics.recordQueueWaitExceeded({
1020+
provider: attrs.provider,
1021+
reason: attrs.reason,
10461022
})
10471023
},
10481024

apps/sim/lib/monitoring/metrics.ts

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
/**
2+
* Hosted-key OTel metrics.
3+
*
4+
* Point events (usage, cost, throttles, queue waits) are emitted as metrics —
5+
* not spans — so they bypass trace sampling and survive aggregation. Reads the
6+
* global MeterProvider, which the Next.js app registers in `instrumentation-node.ts`
7+
* and trigger.dev registers from `trigger.config.ts`; with no provider the API
8+
* returns a no-op meter, so these recorders are always safe to call.
9+
*
10+
* Labels stay low-cardinality (provider, tool, reason, key). `key` is the env var
11+
* NAME of the chosen hosted key (e.g. `PERPLEXITY_API_KEY_2`) — never the secret —
12+
* and the pool is operator-managed, so it's safe to label. Per-workspace/user cost
13+
* lives exactly in the `usage_log` table — never put those on metric labels.
14+
*/
15+
16+
import { type Counter, type Histogram, metrics } from '@opentelemetry/api'
17+
18+
const METER_NAME = 'sim.hosted-key'
19+
const METER_VERSION = '1.0.0'
20+
21+
type ThrottleReason = 'billing_actor_limit' | 'upstream_retries_exhausted'
22+
type QueueReason = 'actor_requests' | 'dimension' | 'queue_position'
23+
type FailureReason = 'rate_limited' | 'auth' | 'other'
24+
25+
let meter: ReturnType<typeof metrics.getMeter> | undefined
26+
let usedCounter: Counter | undefined
27+
let failedCounter: Counter | undefined
28+
let costCounter: Counter | undefined
29+
let throttledCounter: Counter | undefined
30+
let upstreamRateLimitedCounter: Counter | undefined
31+
let queueWaitHistogram: Histogram | undefined
32+
let queueWaitExceededCounter: Counter | undefined
33+
let unknownModelCostCounter: Counter | undefined
34+
35+
function getMeter() {
36+
if (!meter) meter = metrics.getMeter(METER_NAME, METER_VERSION)
37+
return meter
38+
}
39+
40+
function getUsedCounter() {
41+
if (!usedCounter) {
42+
usedCounter = getMeter().createCounter('hosted_key.used', {
43+
description: 'Successful tool executions backed by a Sim-hosted API key',
44+
})
45+
}
46+
return usedCounter
47+
}
48+
49+
function getFailedCounter() {
50+
if (!failedCounter) {
51+
failedCounter = getMeter().createCounter('hosted_key.failed', {
52+
description: 'Failed tool executions backed by a Sim-hosted API key',
53+
})
54+
}
55+
return failedCounter
56+
}
57+
58+
function getCostCounter() {
59+
if (!costCounter) {
60+
costCounter = getMeter().createCounter('hosted_key.cost_charged', {
61+
description: 'Dollar cost charged to the billing actor for hosted-key usage',
62+
unit: 'USD',
63+
})
64+
}
65+
return costCounter
66+
}
67+
68+
function getThrottledCounter() {
69+
if (!throttledCounter) {
70+
throttledCounter = getMeter().createCounter('hosted_key.throttled', {
71+
description: 'Rate-limit errors surfaced to the end user (not retried/absorbed)',
72+
})
73+
}
74+
return throttledCounter
75+
}
76+
77+
function getUpstreamRateLimitedCounter() {
78+
if (!upstreamRateLimitedCounter) {
79+
upstreamRateLimitedCounter = getMeter().createCounter('hosted_key.upstream_rate_limited', {
80+
description: 'Upstream provider 429s absorbed via retry/backoff',
81+
})
82+
}
83+
return upstreamRateLimitedCounter
84+
}
85+
86+
function getQueueWaitHistogram() {
87+
if (!queueWaitHistogram) {
88+
queueWaitHistogram = getMeter().createHistogram('hosted_key.queue_wait_duration', {
89+
description: 'Time a hosted-key acquisition spent waiting in the per-workspace queue/bucket',
90+
unit: 'ms',
91+
})
92+
}
93+
return queueWaitHistogram
94+
}
95+
96+
function getQueueWaitExceededCounter() {
97+
if (!queueWaitExceededCounter) {
98+
queueWaitExceededCounter = getMeter().createCounter('hosted_key.queue_wait_exceeded', {
99+
description: 'Hosted-key acquisitions that exceeded the queue wait cap and fell back to 429',
100+
})
101+
}
102+
return queueWaitExceededCounter
103+
}
104+
105+
function getUnknownModelCostCounter() {
106+
if (!unknownModelCostCounter) {
107+
unknownModelCostCounter = getMeter().createCounter('hosted_key.unknown_model_cost', {
108+
description: 'Hosted-key cost calculations that fell back to a default for an unmapped model',
109+
})
110+
}
111+
return unknownModelCostCounter
112+
}
113+
114+
export const hostedKeyMetrics = {
115+
recordUsed(labels: { provider: string; tool: string; key: string }) {
116+
getUsedCounter().add(1, labels)
117+
},
118+
recordFailed(labels: { provider: string; tool: string; key: string; reason: FailureReason }) {
119+
getFailedCounter().add(1, labels)
120+
},
121+
recordCostCharged(costUsd: number, labels: { provider: string; tool: string }) {
122+
if (costUsd > 0) getCostCounter().add(costUsd, labels)
123+
},
124+
recordThrottled(labels: { provider: string; tool: string; reason: ThrottleReason }) {
125+
getThrottledCounter().add(1, labels)
126+
},
127+
recordUpstreamRateLimited(labels: { tool: string; key: string }) {
128+
getUpstreamRateLimitedCounter().add(1, labels)
129+
},
130+
recordQueueWait(durationMs: number, labels: { provider: string; reason: QueueReason }) {
131+
getQueueWaitHistogram().record(durationMs, labels)
132+
},
133+
recordQueueWaitExceeded(labels: { provider: string; reason: QueueReason }) {
134+
getQueueWaitExceededCounter().add(1, labels)
135+
},
136+
recordUnknownModelCost(labels: { tool: string }) {
137+
getUnknownModelCostCounter().add(1, labels)
138+
},
139+
}

apps/sim/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@
7070
"@opentelemetry/exporter-metrics-otlp-http": "^0.217.0",
7171
"@opentelemetry/exporter-trace-otlp-http": "^0.217.0",
7272
"@opentelemetry/resources": "^2.7.0",
73+
"@opentelemetry/sdk-metrics": "^2.7.0",
7374
"@opentelemetry/sdk-node": "^0.217.0",
7475
"@opentelemetry/sdk-trace-base": "^2.7.0",
7576
"@opentelemetry/sdk-trace-node": "^2.7.0",

0 commit comments

Comments
 (0)