From 5757ef43288316936b33f7f01dcf7e32113c01ce Mon Sep 17 00:00:00 2001 From: Sadeequ <70214653+Sadeequ@users.noreply.github.com> Date: Sun, 28 Jun 2026 23:14:35 +0000 Subject: [PATCH] LLM infrastructure implementation --- api/app.py | 11 +- api/routers/__init__.py | 2 + api/routers/llm_health.py | 20 ++++ api/tests/test_llm_health.py | 33 ++++++ astroml/db/schema.py | 15 +-- astroml/llm/explainer.py | 3 +- astroml/llm/health.py | 113 ++++++++++++++++++ astroml/llm/metrics.py | 32 ++++++ astroml/llm/tracker.py | 73 ++++++++---- docs/runbooks/llm_health.md | 88 ++++++++++++++ monitoring/grafana/llm_health_dashboard.json | 114 +++++++++++++++++++ monitoring/prometheus/alert_rules.yml | 48 +++++++- monitoring/prometheus/prometheus.yml | 10 ++ 13 files changed, 523 insertions(+), 39 deletions(-) create mode 100644 api/routers/llm_health.py create mode 100644 api/tests/test_llm_health.py create mode 100644 astroml/llm/health.py create mode 100644 astroml/llm/metrics.py create mode 100644 docs/runbooks/llm_health.md create mode 100644 monitoring/grafana/llm_health_dashboard.json diff --git a/api/app.py b/api/app.py index f5f5f9f..701ce81 100644 --- a/api/app.py +++ b/api/app.py @@ -23,8 +23,9 @@ from contextlib import asynccontextmanager from typing import AsyncGenerator -from fastapi import FastAPI, Request +from fastapi import FastAPI, Request, Response from fastapi.middleware.cors import CORSMiddleware +from prometheus_client import CONTENT_TYPE_LATEST, generate_latest from api.auth.middleware import AuthMiddleware from api.audit_middleware import AuditLoggingMiddleware @@ -45,6 +46,7 @@ feedback_router, fraud_router, loyalty_router, + llm_health_router, mentorship_router, models_router, monitoring_router, @@ -61,6 +63,7 @@ ) from api.routers.monitoring import record_latency from api.routers.ws import poll_and_broadcast_transactions +from astroml.llm import metrics as _llm_metrics # Setup distributed tracing (issue #336) _tracer_provider = setup_tracing() @@ -168,6 
+171,7 @@ async def _latency_middleware(request: Request, call_next): app.include_router(ws_router) app.include_router(streaming_router) app.include_router(llm_router) +app.include_router(llm_health_router) app.include_router(reports_router) app.include_router(alerts_router) @@ -177,6 +181,11 @@ async def health(): return {"status": "ok"} +@app.get("/metrics", tags=["ops"]) +async def prometheus_metrics(): + return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST) + + @app.get("/api/v1", tags=["ops"]) async def api_root(): return {"version": settings.api_version, "status": "ok"} diff --git a/api/routers/__init__.py b/api/routers/__init__.py index 2f3c75b..70fbfee 100644 --- a/api/routers/__init__.py +++ b/api/routers/__init__.py @@ -22,6 +22,7 @@ from api.routers.ws import router as ws_router from api.routers.streaming import router as streaming_router from api.routers.llm import router as llm_router +from api.routers.llm_health import router as llm_health_router from api.routers.reports import router as reports_router from api.routers.alerts import router as alerts_router @@ -49,6 +50,7 @@ "ws_router", "streaming_router", "llm_router", + "llm_health_router", "reports_router", "alerts_router", ] diff --git a/api/routers/llm_health.py b/api/routers/llm_health.py new file mode 100644 index 0000000..8d7e629 --- /dev/null +++ b/api/routers/llm_health.py @@ -0,0 +1,20 @@ +"""LLM Health and Provider Status API.""" +from __future__ import annotations + +from fastapi import APIRouter + +from astroml.llm.health import check_all_providers, check_provider_health + +router = APIRouter(prefix="/api/v1/llm", tags=["llm-health"]) + + +@router.get("/health") +async def llm_health(): + result = await check_all_providers() + return result + + +@router.get("/health/{provider_name}") +async def llm_provider_health(provider_name: str): + result = await check_provider_health(provider_name) + return result diff --git a/api/tests/test_llm_health.py b/api/tests/test_llm_health.py 
new file mode 100644 index 0000000..3aa82b0 --- /dev/null +++ b/api/tests/test_llm_health.py @@ -0,0 +1,33 @@ +"""Integration tests for LLM health endpoints.""" +from __future__ import annotations + + +class TestLLMHealth: + def test_llm_health_returns_200(self, client): + resp = client.get("/api/v1/llm/health") + assert resp.status_code == 200 + + def test_llm_health_has_overall_status(self, client): + data = client.get("/api/v1/llm/health").json() + assert "overall_status" in data + assert "providers" in data + assert "checked_at" in data + + def test_llm_provider_health_endpoint(self, client): + resp = client.get("/api/v1/llm/health/openai") + assert resp.status_code == 200 + data = resp.json() + assert data["provider"] == "openai" + assert "status" in data + assert "latency_ms" in data + + def test_llm_health_providers_include_expected(self, client): + data = client.get("/api/v1/llm/health").json() + assert "openai" in data["providers"] + assert "anthropic" in data["providers"] + assert "huggingface" in data["providers"] + + def test_prometheus_metrics_endpoint(self, client): + resp = client.get("/metrics") + assert resp.status_code == 200 + assert "astroml_llm_provider_health" in resp.text diff --git a/astroml/db/schema.py b/astroml/db/schema.py index f4da902..f5d1deb 100644 --- a/astroml/db/schema.py +++ b/astroml/db/schema.py @@ -643,7 +643,7 @@ class ProcessedLedger(Base): __tablename__ = "processed_ledgers" id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) -ledger_sequence: Mapped[int] = mapped_column(Integer, unique=True, nullable=False) + ledger_sequence: Mapped[int] = mapped_column(Integer, unique=True, nullable=False) source: Mapped[str] = mapped_column( String(256), nullable=False, @@ -653,14 +653,10 @@ class ProcessedLedger(Base): nullable=False, server_default=func.now(), ) -status: Mapped[ - Literal["pending", "processing", "completed", "failed"] -] = mapped_column( - String(16), - nullable=False, - 
server_default="pending", -) - String(32), + status: Mapped[ + Literal["pending", "processing", "completed", "failed"] + ] = mapped_column( + String(16), nullable=False, server_default="pending", ) @@ -679,4 +675,3 @@ class ProcessedLedger(Base): Index("ix_processed_ledgers_status", "status"), Index("ix_processed_ledgers_source", "source"), ) - ) diff --git a/astroml/llm/explainer.py b/astroml/llm/explainer.py index 5bc37a9..950a9f5 100644 --- a/astroml/llm/explainer.py +++ b/astroml/llm/explainer.py @@ -37,11 +37,12 @@ def generate_explanation(self, alert_id: int, account_id: str, pattern: str, sco latency_ms=latency_ms ) - # Cache the response self.cache.set(prompt, response) return response except Exception as e: + provider_name = self.provider.__class__.__name__.replace("Provider", "").lower() + global_tracker.record_error(provider_name) return f"Error generating explanation: {str(e)}" def _build_prompt(self, account_id: str, pattern: str, score: float, transactions: List[Dict[str, Any]]) -> str: diff --git a/astroml/llm/health.py b/astroml/llm/health.py new file mode 100644 index 0000000..e98c250 --- /dev/null +++ b/astroml/llm/health.py @@ -0,0 +1,113 @@ +"""LLM Provider health checks.""" +import asyncio +import os +import time +from typing import Any, Dict + +import aiohttp + +PROVIDER_ENDPOINTS = { + "openai": { + "url": "https://api.openai.com/v1/models", + "method": "GET", + "headers": lambda key: {"Authorization": f"Bearer {key}"}, + }, + "anthropic": { + "url": "https://api.anthropic.com/v1/messages", + "method": "HEAD", + "headers": lambda key: { + "x-api-key": key, + "anthropic-version": "2023-06-01", + }, + }, + "huggingface": { + "url": "https://api-inference.huggingface.co/status", + "method": "GET", + "headers": lambda key: {"Authorization": f"Bearer {key}"}, + }, +} + + +def _get_api_key(provider_name: str) -> str: + env_key = f"{provider_name.upper()}_API_KEY" + return os.getenv(env_key, "") + + +async def check_provider_health( + provider_name: 
str, timeout: float = 5.0 +) -> Dict[str, Any]: + start = time.perf_counter() + if provider_name not in PROVIDER_ENDPOINTS: + latency_ms = (time.perf_counter() - start) * 1000 + return { + "provider": provider_name, + "status": "unknown", + "latency_ms": round(latency_ms, 2), + "error": "Provider not supported for health checks", + } + + api_key = _get_api_key(provider_name) + if not api_key: + latency_ms = (time.perf_counter() - start) * 1000 + return { + "provider": provider_name, + "status": "unhealthy", + "latency_ms": round(latency_ms, 2), + "error": "API key not configured", + } + + config = PROVIDER_ENDPOINTS[provider_name] + + try: + async with aiohttp.ClientSession() as session: + async with session.request( + method=config["method"], + url=config["url"], + headers=config["headers"](api_key), + timeout=aiohttp.ClientTimeout(total=timeout), + ) as response: + latency_ms = (time.perf_counter() - start) * 1000 + healthy = 200 <= response.status < 300 + return { + "provider": provider_name, + "status": "healthy" if healthy else "unhealthy", + "latency_ms": round(latency_ms, 2), + "http_status": response.status, + } + except Exception as e: + latency_ms = (time.perf_counter() - start) * 1000 + return { + "provider": provider_name, + "status": "unhealthy", + "latency_ms": round(latency_ms, 2), + "error": str(e), + } + + +async def check_all_providers() -> Dict[str, Any]: + providers = list(PROVIDER_ENDPOINTS.keys()) + results = await asyncio.gather( + *(check_provider_health(p) for p in providers), + return_exceptions=True, + ) + + provider_statuses = {} + for result in results: + if isinstance(result, Exception): + provider_statuses["unknown"] = { + "provider": "unknown", + "status": "unhealthy", + "latency_ms": 0, + "error": str(result), + } + else: + provider_statuses[result["provider"]] = result + + all_healthy = all( + r.get("status") == "healthy" for r in provider_statuses.values() + ) + return { + "overall_status": "healthy" if all_healthy else 
"degraded", + "providers": provider_statuses, + "checked_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), + } diff --git a/astroml/llm/metrics.py b/astroml/llm/metrics.py new file mode 100644 index 0000000..2413d24 --- /dev/null +++ b/astroml/llm/metrics.py @@ -0,0 +1,32 @@ +from prometheus_client import Counter, Gauge, Histogram + +LLM_REQUESTS_TOTAL = Counter( + "astroml_llm_requests_total", + "Total LLM API requests", + ["provider", "status"], +) + +LLM_REQUEST_LATENCY_SECONDS = Histogram( + "astroml_llm_request_latency_seconds", + "LLM API request latency in seconds", + ["provider"], + buckets=[0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60], +) + +LLM_COST_USD_TOTAL = Counter( + "astroml_llm_cost_usd_total", + "Total LLM API cost in USD", + ["provider"], +) + +LLM_TOKENS_TOTAL = Counter( + "astroml_llm_tokens_total", + "Total LLM tokens processed", + ["provider", "token_type"], +) + +LLM_PROVIDER_HEALTH = Gauge( + "astroml_llm_provider_health", + "LLM provider health status (1=healthy, 0=unhealthy)", + ["provider"], +) diff --git a/astroml/llm/tracker.py b/astroml/llm/tracker.py index aba3ca8..85672bd 100644 --- a/astroml/llm/tracker.py +++ b/astroml/llm/tracker.py @@ -1,19 +1,24 @@ """LLM Token Usage and Cost Tracking.""" import logging -from typing import Dict, Optional +from typing import Dict + +from astroml.llm.metrics import ( + LLM_COST_USD_TOTAL, + LLM_REQUEST_LATENCY_SECONDS, + LLM_REQUESTS_TOTAL, + LLM_TOKENS_TOTAL, +) logger = logging.getLogger(__name__) -# Mock cost per 1k tokens for different providers COST_RATES = { "openai": {"prompt": 0.03, "completion": 0.06}, "anthropic": {"prompt": 0.015, "completion": 0.075}, "huggingface": {"prompt": 0.001, "completion": 0.001}, } -class LLMUsageTracker: - """Tracks LLM API usage, costs, and latency.""" +class LLMUsageTracker: def __init__(self): self.total_cost = 0.0 self.total_prompt_tokens = 0 @@ -23,42 +28,66 @@ def __init__(self): def record_usage( self, provider_name: str, usage: Dict[str, int], 
latency_ms: float ) -> float: - """ - Record usage for a request and calculate cost. - Logs an alert if total cost exceeds the threshold. - """ - rates = COST_RATES.get(provider_name.lower(), {"prompt": 0.0, "completion": 0.0}) - + rates = COST_RATES.get( + provider_name.lower(), {"prompt": 0.0, "completion": 0.0} + ) prompt_tokens = usage.get("prompt_tokens", 0) completion_tokens = usage.get("completion_tokens", 0) - - cost = (prompt_tokens / 1000.0) * rates["prompt"] + (completion_tokens / 1000.0) * rates["completion"] - + cost = (prompt_tokens / 1000.0) * rates["prompt"] + ( + completion_tokens / 1000.0 + ) * rates["completion"] + self.total_prompt_tokens += prompt_tokens self.total_completion_tokens += completion_tokens self.total_cost += cost - + + LLM_REQUESTS_TOTAL.labels( + provider=provider_name, status="success" + ).inc() + LLM_REQUEST_LATENCY_SECONDS.labels(provider=provider_name).observe( + latency_ms / 1000.0 + ) + LLM_COST_USD_TOTAL.labels(provider=provider_name).inc(cost) + LLM_TOKENS_TOTAL.labels( + provider=provider_name, token_type="prompt" + ).inc(prompt_tokens) + LLM_TOKENS_TOTAL.labels( + provider=provider_name, token_type="completion" + ).inc(completion_tokens) + logger.info( - "LLM Usage Recorded: Provider=%s, PromptTokens=%d, CompletionTokens=%d, Cost=$%.4f, Latency=%.2fms", - provider_name, prompt_tokens, completion_tokens, cost, latency_ms + "LLM Usage Recorded: Provider=%s, PromptTokens=%d, " + "CompletionTokens=%d, Cost=$%.4f, Latency=%.2fms", + provider_name, + prompt_tokens, + completion_tokens, + cost, + latency_ms, ) - + self.check_alerts() return cost + def record_error(self, provider_name: str) -> None: + LLM_REQUESTS_TOTAL.labels(provider=provider_name, status="error").inc() + def check_alerts(self): - """Check if cost alerts should be triggered.""" if self.total_cost > self.alert_threshold: - logger.warning("LLM Cost Alert! 
Total cost ($%.2f) has exceeded threshold ($%.2f)", self.total_cost, self.alert_threshold) + logger.warning( + "LLM Cost Alert! Total cost ($%.2f) has exceeded " + "threshold ($%.2f)", + self.total_cost, + self.alert_threshold, + ) def get_summary(self) -> Dict[str, float]: - """Get summary of tracking metrics.""" return { "total_cost": self.total_cost, "total_prompt_tokens": self.total_prompt_tokens, "total_completion_tokens": self.total_completion_tokens, - "total_tokens": self.total_prompt_tokens + self.total_completion_tokens + "total_tokens": self.total_prompt_tokens + + self.total_completion_tokens, } -# Global singleton tracker + global_tracker = LLMUsageTracker() diff --git a/docs/runbooks/llm_health.md b/docs/runbooks/llm_health.md new file mode 100644 index 0000000..68a4680 --- /dev/null +++ b/docs/runbooks/llm_health.md @@ -0,0 +1,88 @@ +# LLM Infrastructure Runbook + +## Overview + +This runbook covers health checks, monitoring, alerting, and incident response for LLM providers (OpenAI, Anthropic, HuggingFace). 
+ +## Health Check Architecture + +- **Health endpoints**: `GET /api/v1/llm/health` and `GET /api/v1/llm/health/{provider}` +- **Polling interval**: 60 seconds via Prometheus or external monitor +- **Metrics endpoint**: `GET /metrics` (Prometheus text format) +- **Grafana dashboard**: `monitoring/grafana/llm_health_dashboard.json` + +## Key Metrics + +| Metric | Type | Description | +|--------|------|-------------| +| `astroml_llm_provider_health` | Gauge | 1 = healthy, 0 = unhealthy | +| `astroml_llm_request_latency_seconds` | Histogram | Per-provider latency | +| `astroml_llm_requests_total` | Counter | Request count by provider and status | +| `astroml_llm_cost_usd_total` | Counter | Accumulated cost in USD | +| `astroml_llm_tokens_total` | Counter | Token count by provider and token_type | + +## Alerts + +| Alert | Condition | Severity | +|-------|-----------|----------| +| `LLMProviderDown` | Provider health == 0 for > 2m | Critical | +| `LLMHighErrorRate` | Error rate > 0.1 req/s for > 2m | Warning | +| `LLMCostThreshold` | Cost > $10 in 1h window for > 5m | Warning | +| `LLMHighLatency` | P95 latency > 5s for > 3m | Warning | + +## Cost Tracking + +- **Threshold**: $100 (logged) +- **Granularity**: Per-request cost calculated using mock rates in `astroml/llm/tracker.py` +- **Alerting**: Prometheus `LLMCostThreshold` rule triggers on spikes (>$10/hour) +- **Dashboard**: Cost panel in Grafana shows 1-hour rolling sums + +## Incident Response + +### Provider Down +1. Check `LLMProviderDown` alert in Alertmanager +2. Verify API keys are configured (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `HUGGINGFACE_API_KEY`) +3. Check network connectivity from container to provider API +4. Review provider status pages: OpenAI, Anthropic, HuggingFace +5. Rotate API keys if exposure is suspected +6. Failover: update `LLM_PROVIDER` env var to alternate provider + +### High Error Rate +1. Check `LLMHighErrorRate` alert +2. Correlate with latency spikes in Grafana dashboard +3. 
Review application logs for stack traces +4. Check for rate limits or quota exhaustion +5. Consider switching providers or reducing request rate + +### Cost Spike +1. Check `LLMCostThreshold` alert +2. Correlate with traffic volume in Grafana +3. Review recent deployments for prompt regression +4. If legitimate growth, update budget thresholds +5. If anomaly, audit prompt caching (`SemanticCache`) and consider tightening limits + +## Runbook Verification + +```bash +# Verify health endpoint +curl -s http://localhost:8000/api/v1/llm/health | jq + +# Verify metrics exposition +curl -s http://localhost:8000/metrics | grep astroml_llm_ + +# Run monitoring stack +docker compose --profile monitoring up -d + +# Check Prometheus targets +open http://localhost:9090/targets + +# Open Grafana +open http://localhost:3000 +# Default login: admin/admin +``` + +## Maintenance + +- **Dashboard refresh**: Import `monitoring/grafana/llm_health_dashboard.json` into Grafana +- **Alert review**: Review rules in `monitoring/prometheus/alert_rules.yml` +- **Rate updates**: Update the mock cost rates (`COST_RATES` in `astroml/llm/tracker.py`) from provider pricing pages diff --git a/monitoring/grafana/llm_health_dashboard.json b/monitoring/grafana/llm_health_dashboard.json new file mode 100644 index 0000000..124fa24 --- /dev/null +++ b/monitoring/grafana/llm_health_dashboard.json @@ -0,0 +1,114 @@ +{ + "dashboard": { + "title": "LLM Health Monitoring", + "uid": "llm-health", + "timezone": "browser", + "schemaVersion": 38, + "version": 0, + "refresh": "1m", + "time": {"from": "now-6h", "to": "now"}, + "panels": [ + { + "id": 1, + "title": "Provider Health Status", + "type": "stat", + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 0}, + "targets": [ + { + "expr": "astroml_llm_provider_health", + "legendFormat": "{{provider}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "mappings": [ + {"type": "value", "value": "0", "text": "Unhealthy"}, + {"type": "value", "value": "1", "text": 
"Healthy"} + ], + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "red", "value": 0}, + {"color": "green", "value": 1} + ] + } + } + } + }, + { + "id": 2, + "title": "P95 Request Latency (seconds)", + "type": "graph", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}, + "targets": [ + { + "expr": "histogram_quantile(0.95, rate(astroml_llm_request_latency_seconds_bucket[5m]))", + "legendFormat": "{{provider}}", + "refId": "A" + } + ], + "yaxes": [ + {"format": "s", "label": "Latency"}, + {"format": "short", "show": false} + ] + }, + { + "id": 3, + "title": "Request Error Rate (per sec)", + "type": "graph", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8}, + "targets": [ + { + "expr": "rate(astroml_llm_requests_total{status=\"error\"}[5m])", + "legendFormat": "{{provider}}", + "refId": "A" + } + ] + }, + { + "id": 4, + "title": "Total Cost (USD)", + "type": "graph", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16}, + "targets": [ + { + "expr": "sum by (provider) (increase(astroml_llm_cost_usd_total[1h]))", + "legendFormat": "{{provider}}", + "refId": "A" + } + ], + "yaxes": [ + {"format": "currencyUSD", "label": "Cost"}, + {"format": "short", "show": false} + ] + }, + { + "id": 5, + "title": "Total Tokens (last 1h)", + "type": "graph", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16}, + "targets": [ + { + "expr": "sum by (provider, token_type) (increase(astroml_llm_tokens_total[1h]))", + "legendFormat": "{{provider}} - {{token_type}}", + "refId": "A" + } + ] + }, + { + "id": 6, + "title": "Total Requests", + "type": "graph", + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 24}, + "targets": [ + { + "expr": "sum by (provider, status) (astroml_llm_requests_total)", + "legendFormat": "{{provider}} ({{status}})", + "refId": "A" + } + ] + } + ] + } +} diff --git a/monitoring/prometheus/alert_rules.yml b/monitoring/prometheus/alert_rules.yml index b61b75e..84264bc 100644 --- a/monitoring/prometheus/alert_rules.yml +++ b/monitoring/prometheus/alert_rules.yml @@ -28,11 
+28,49 @@ groups: summary: "Ingestion stalled for {{ $labels.stream_type }} on {{ $labels.horizon_url }}" description: "No records have been processed for {{ $labels.stream_type }} in the last 15 minutes." - - alert: PersistentRateLimit - expr: astroml_ingestion_rate_limit_backoff_seconds > 60 - for: 10m + - alert: PersistentRateLimit + expr: astroml_ingestion_rate_limit_backoff_seconds > 60 + for: 10m + labels: + severity: warning + annotations: + summary: "Persistent rate limiting for {{ $labels.stream_type }}" + description: "The ingestion service is facing persistent rate limiting with backoff > 60s for over 10 minutes." + + - name: astroml_llm_alerts + rules: + - alert: LLMProviderDown + expr: astroml_llm_provider_health == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "LLM provider {{ $labels.provider }} is down" + description: "The {{ $labels.provider }} LLM provider has been unreachable for more than 2 minutes." + + - alert: LLMHighErrorRate + expr: rate(astroml_llm_requests_total{status="error"}[5m]) > 0.1 + for: 2m + labels: + severity: warning + annotations: + summary: "High LLM error rate for {{ $labels.provider }}" + description: "LLM error rate for {{ $labels.provider }} is currently {{ $value }} requests/sec." + + - alert: LLMCostThreshold + expr: increase(astroml_llm_cost_usd_total[1h]) > 10 + for: 5m + labels: + severity: warning + annotations: + summary: "LLM cost spike for {{ $labels.provider }}" + description: "LLM cost for {{ $labels.provider }} exceeded $10 in the last hour." + + - alert: LLMHighLatency + expr: histogram_quantile(0.95, rate(astroml_llm_request_latency_seconds_bucket[5m])) > 5 + for: 3m labels: severity: warning annotations: - summary: "Persistent rate limiting for {{ $labels.stream_type }}" - description: "The ingestion service is facing persistent rate limiting with backoff > 60s for over 10 minutes." 
+ summary: "High LLM latency for {{ $labels.provider }}" + description: "P95 latency for {{ $labels.provider }} is {{ $value }}s (threshold 5s)." diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml index 6b73ec3..0807380 100644 --- a/monitoring/prometheus/prometheus.yml +++ b/monitoring/prometheus/prometheus.yml @@ -90,3 +90,13 @@ scrape_configs: - source_labels: [__address__] target_label: instance replacement: 'production' + + # FastAPI API service metrics + - job_name: 'astroml-api' + metrics_path: '/metrics' + static_configs: + - targets: ['api:8000'] + relabel_configs: + - source_labels: [__address__] + target_label: instance + replacement: 'api'