|
| 1 | +/** |
| 2 | + * Hosted-key OTel metrics. |
| 3 | + * |
| 4 | + * Point events (usage, cost, throttles, queue waits) are emitted as metrics — |
| 5 | + * not spans — so they bypass trace sampling and survive aggregation. Reads the |
| 6 | + * global MeterProvider, which the Next.js app registers in `instrumentation-node.ts` |
| 7 | + * and trigger.dev registers from `trigger.config.ts`; with no provider the API |
| 8 | + * returns a no-op meter, so these recorders are always safe to call. |
| 9 | + * |
| 10 | + * Labels stay low-cardinality (provider, tool, reason, key). `key` is the env var |
| 11 | + * NAME of the chosen hosted key (e.g. `PERPLEXITY_API_KEY_2`) — never the secret — |
| 12 | + * and the pool is operator-managed, so it's safe to label. Exact per-workspace/user |
| 13 | + * cost lives in the `usage_log` table — never put workspace or user identifiers on metric labels. |
| 14 | + */ |
| 15 | + |
| 16 | +import { type Counter, type Histogram, metrics } from '@opentelemetry/api' |
| 17 | + |
/** Instrumentation scope name and version passed to `metrics.getMeter`. */
const METER_NAME = 'sim.hosted-key'
const METER_VERSION = '1.0.0'

/** Allowed `reason` label values for `recordThrottled`. */
type ThrottleReason =
  | 'billing_actor_limit'
  | 'upstream_retries_exhausted'

/** Allowed `reason` label values for the queue-wait recorders. */
type QueueReason =
  | 'actor_requests'
  | 'dimension'
  | 'queue_position'

/** Allowed `reason` label values for `recordFailed`. */
type FailureReason =
  | 'rate_limited'
  | 'auth'
  | 'other'

/** Meter type exactly as returned by the OTel metrics API. */
type HostedKeyMeter = ReturnType<typeof metrics.getMeter>

// Lazily-populated caches: each slot stays `undefined` until its getter is
// first called, after which the same meter/instrument instance is reused.
let meter: HostedKeyMeter | undefined
let usedCounter: Counter | undefined
let failedCounter: Counter | undefined
let costCounter: Counter | undefined
let throttledCounter: Counter | undefined
let upstreamRateLimitedCounter: Counter | undefined
let queueWaitHistogram: Histogram | undefined
let queueWaitExceededCounter: Counter | undefined
let unknownModelCostCounter: Counter | undefined
| 34 | + |
/**
 * Returns the hosted-key meter, resolving it from the global OTel metrics API
 * on first use and caching it for every later call.
 *
 * NOTE(review): because the result is cached, a call made before a
 * MeterProvider is registered would presumably pin the no-op meter for the
 * rest of the process — confirm provider registration always happens first.
 */
function getMeter() {
  if (meter) return meter

  const resolved = metrics.getMeter(METER_NAME, METER_VERSION)
  meter = resolved
  return resolved
}
| 39 | + |
/** Returns the cached `hosted_key.used` counter, creating it on first call. */
function getUsedCounter() {
  if (usedCounter) return usedCounter

  const created = getMeter().createCounter('hosted_key.used', {
    description: 'Successful tool executions backed by a Sim-hosted API key',
  })
  usedCounter = created
  return created
}
| 48 | + |
/** Returns the cached `hosted_key.failed` counter, creating it on first call. */
function getFailedCounter() {
  if (failedCounter) return failedCounter

  const created = getMeter().createCounter('hosted_key.failed', {
    description: 'Failed tool executions backed by a Sim-hosted API key',
  })
  failedCounter = created
  return created
}
| 57 | + |
/**
 * Returns the cached `hosted_key.cost_charged` counter (unit `USD`),
 * creating it on first call.
 */
function getCostCounter() {
  if (costCounter) return costCounter

  const created = getMeter().createCounter('hosted_key.cost_charged', {
    description: 'Dollar cost charged to the billing actor for hosted-key usage',
    unit: 'USD',
  })
  costCounter = created
  return created
}
| 67 | + |
/** Returns the cached `hosted_key.throttled` counter, creating it on first call. */
function getThrottledCounter() {
  if (throttledCounter) return throttledCounter

  const created = getMeter().createCounter('hosted_key.throttled', {
    description: 'Rate-limit errors surfaced to the end user (not retried/absorbed)',
  })
  throttledCounter = created
  return created
}
| 76 | + |
/**
 * Returns the cached `hosted_key.upstream_rate_limited` counter, creating it
 * on first call.
 */
function getUpstreamRateLimitedCounter() {
  if (upstreamRateLimitedCounter) return upstreamRateLimitedCounter

  const created = getMeter().createCounter('hosted_key.upstream_rate_limited', {
    description: 'Upstream provider 429s absorbed via retry/backoff',
  })
  upstreamRateLimitedCounter = created
  return created
}
| 85 | + |
/**
 * Returns the cached `hosted_key.queue_wait_duration` histogram (unit `ms`),
 * creating it on first call.
 */
function getQueueWaitHistogram() {
  if (queueWaitHistogram) return queueWaitHistogram

  const created = getMeter().createHistogram('hosted_key.queue_wait_duration', {
    description: 'Time a hosted-key acquisition spent waiting in the per-workspace queue/bucket',
    unit: 'ms',
  })
  queueWaitHistogram = created
  return created
}
| 95 | + |
/**
 * Returns the cached `hosted_key.queue_wait_exceeded` counter, creating it on
 * first call.
 */
function getQueueWaitExceededCounter() {
  if (queueWaitExceededCounter) return queueWaitExceededCounter

  const created = getMeter().createCounter('hosted_key.queue_wait_exceeded', {
    description: 'Hosted-key acquisitions that exceeded the queue wait cap and fell back to 429',
  })
  queueWaitExceededCounter = created
  return created
}
| 104 | + |
/**
 * Returns the cached `hosted_key.unknown_model_cost` counter, creating it on
 * first call.
 */
function getUnknownModelCostCounter() {
  if (unknownModelCostCounter) return unknownModelCostCounter

  const created = getMeter().createCounter('hosted_key.unknown_model_cost', {
    description: 'Hosted-key cost calculations that fell back to a default for an unmapped model',
  })
  unknownModelCostCounter = created
  return created
}
| 113 | + |
/**
 * Recorders for hosted-key metrics. Each method forwards to a lazily-created
 * instrument, so nothing is created until the first recording of that kind.
 *
 * Label objects are passed to the instrument unchanged; keep their values
 * low-cardinality.
 */
export const hostedKeyMetrics = {
  /** Counts one successful tool execution backed by a hosted key. */
  recordUsed(labels: { provider: string; tool: string; key: string }): void {
    getUsedCounter().add(1, labels)
  },

  /** Counts one failed tool execution backed by a hosted key. */
  recordFailed(labels: {
    provider: string
    tool: string
    key: string
    reason: FailureReason
  }): void {
    getFailedCounter().add(1, labels)
  },

  /**
   * Adds `costUsd` to the charged-cost counter.
   *
   * Values that are zero, negative, NaN or infinite are ignored: the counter
   * is a running sum, so a single non-finite sample would make the total
   * meaningless from then on.
   *
   * @param costUsd Amount charged, in US dollars.
   */
  recordCostCharged(costUsd: number, labels: { provider: string; tool: string }): void {
    if (Number.isFinite(costUsd) && costUsd > 0) getCostCounter().add(costUsd, labels)
  },

  /** Counts one rate-limit error surfaced to the end user. */
  recordThrottled(labels: { provider: string; tool: string; reason: ThrottleReason }): void {
    getThrottledCounter().add(1, labels)
  },

  /**
   * Counts one upstream rate-limit response that was absorbed by retry/backoff.
   *
   * NOTE(review): unlike the other recorders this one takes no `provider`
   * label — confirm that is intentional.
   */
  recordUpstreamRateLimited(labels: { tool: string; key: string }): void {
    getUpstreamRateLimitedCounter().add(1, labels)
  },

  /**
   * Records how long an acquisition waited in the queue.
   *
   * Negative, NaN and infinite durations are dropped rather than recorded,
   * since they cannot represent a real wait.
   *
   * @param durationMs Wait time in milliseconds.
   */
  recordQueueWait(durationMs: number, labels: { provider: string; reason: QueueReason }): void {
    if (!Number.isFinite(durationMs) || durationMs < 0) return
    getQueueWaitHistogram().record(durationMs, labels)
  },

  /** Counts one acquisition that exceeded the queue wait cap. */
  recordQueueWaitExceeded(labels: { provider: string; reason: QueueReason }): void {
    getQueueWaitExceededCounter().add(1, labels)
  },

  /** Counts one cost calculation that fell back to a default for an unmapped model. */
  recordUnknownModelCost(labels: { tool: string }): void {
    getUnknownModelCostCounter().add(1, labels)
  },
}
0 commit comments