From 5757ef43288316936b33f7f01dcf7e32113c01ce Mon Sep 17 00:00:00 2001
From: Sadeequ <70214653+Sadeequ@users.noreply.github.com>
Date: Sun, 28 Jun 2026 23:14:35 +0000
Subject: [PATCH] LLM infrastructure implementation

---
 api/app.py                                   |  11 +-
 api/routers/__init__.py                      |   2 +
 api/routers/llm_health.py                    |  20 ++++
 api/tests/test_llm_health.py                 |  33 ++++++
 astroml/db/schema.py                         |  15 +--
 astroml/llm/explainer.py                     |   3 +-
 astroml/llm/health.py                        | 113 ++++++++++++++++++
 astroml/llm/metrics.py                       |  32 ++++++
 astroml/llm/tracker.py                       |  73 ++++++++----
 docs/runbooks/llm_health.md                  |  88 ++++++++++++++
 monitoring/grafana/llm_health_dashboard.json | 114 +++++++++++++++++++
 monitoring/prometheus/alert_rules.yml        |  48 +++++++-
 monitoring/prometheus/prometheus.yml         |  10 ++
 13 files changed, 523 insertions(+), 39 deletions(-)
 create mode 100644 api/routers/llm_health.py
 create mode 100644 api/tests/test_llm_health.py
 create mode 100644 astroml/llm/health.py
 create mode 100644 astroml/llm/metrics.py
 create mode 100644 docs/runbooks/llm_health.md
 create mode 100644 monitoring/grafana/llm_health_dashboard.json

diff --git a/api/app.py b/api/app.py
index f5f5f9f..701ce81 100644
--- a/api/app.py
+++ b/api/app.py
@@ -23,8 +23,9 @@
 from contextlib import asynccontextmanager
 from typing import AsyncGenerator
 
-from fastapi import FastAPI, Request
+from fastapi import FastAPI, Request, Response
 from fastapi.middleware.cors import CORSMiddleware
+from prometheus_client import CONTENT_TYPE_LATEST, generate_latest
 
 from api.auth.middleware import AuthMiddleware
 from api.audit_middleware import AuditLoggingMiddleware
@@ -45,6 +46,7 @@
     feedback_router,
     fraud_router,
     loyalty_router,
+    llm_health_router,
     mentorship_router,
     models_router,
     monitoring_router,
@@ -61,6 +63,7 @@
 )
 from api.routers.monitoring import record_latency
 from api.routers.ws import poll_and_broadcast_transactions
+from astroml.llm import metrics as _llm_metrics
 
 # Setup distributed tracing (issue #336)
 _tracer_provider = setup_tracing()
@@ -168,6 +171,7 @@ async def _latency_middleware(request: Request, call_next):
 app.include_router(ws_router)
 app.include_router(streaming_router)
 app.include_router(llm_router)
+app.include_router(llm_health_router)
 app.include_router(reports_router)
 app.include_router(alerts_router)
 
@@ -177,6 +181,11 @@ async def health():
     return {"status": "ok"}
 
 
+@app.get("/metrics", tags=["ops"])
+async def prometheus_metrics():
+    return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST)
+
+
 @app.get("/api/v1", tags=["ops"])
 async def api_root():
     return {"version": settings.api_version, "status": "ok"}
diff --git a/api/routers/__init__.py b/api/routers/__init__.py
index 2f3c75b..70fbfee 100644
--- a/api/routers/__init__.py
+++ b/api/routers/__init__.py
@@ -22,6 +22,7 @@
 from api.routers.ws import router as ws_router
 from api.routers.streaming import router as streaming_router
 from api.routers.llm import router as llm_router
+from api.routers.llm_health import router as llm_health_router
 from api.routers.reports import router as reports_router
 from api.routers.alerts import router as alerts_router
 
@@ -49,6 +50,7 @@
     "ws_router",
     "streaming_router",
     "llm_router",
+    "llm_health_router",
     "reports_router",
     "alerts_router",
 ]
diff --git a/api/routers/llm_health.py b/api/routers/llm_health.py
new file mode 100644
index 0000000..8d7e629
--- /dev/null
+++ b/api/routers/llm_health.py
@@ -0,0 +1,20 @@
+"""LLM Health and Provider Status API."""
+from __future__ import annotations
+
+from fastapi import APIRouter
+
+from astroml.llm.health import check_all_providers, check_provider_health
+
+router = APIRouter(prefix="/api/v1/llm", tags=["llm-health"])
+
+
+@router.get("/health")
+async def llm_health():
+    """Return the aggregated health report for every LLM provider."""
+    return await check_all_providers()
+
+
+@router.get("/health/{provider_name}")
+async def llm_provider_health(provider_name: str):
+    """Return the health report for the single provider named in the path."""
+    return await check_provider_health(provider_name)
diff --git a/api/tests/test_llm_health.py b/api/tests/test_llm_health.py
new file mode 100644
index 0000000..3aa82b0
--- /dev/null
+++ b/api/tests/test_llm_health.py
@@ -0,0 +1,33 @@
+"""Integration tests for LLM health endpoints."""
+from __future__ import annotations
+
+
+class TestLLMHealth:
+    def test_llm_health_returns_200(self, client):
+        resp = client.get("/api/v1/llm/health")
+        assert resp.status_code == 200
+
+    def test_llm_health_has_overall_status(self, client):
+        data = client.get("/api/v1/llm/health").json()
+        assert "overall_status" in data
+        assert "providers" in data
+        assert "checked_at" in data
+
+    def test_llm_provider_health_endpoint(self, client):
+        resp = client.get("/api/v1/llm/health/openai")
+        assert resp.status_code == 200
+        data = resp.json()
+        assert data["provider"] == "openai"
+        assert "status" in data
+        assert "latency_ms" in data
+
+    def test_llm_health_providers_include_expected(self, client):
+        data = client.get("/api/v1/llm/health").json()
+        assert "openai" in data["providers"]
+        assert "anthropic" in data["providers"]
+        assert "huggingface" in data["providers"]
+
+    def test_prometheus_metrics_endpoint(self, client):
+        resp = client.get("/metrics")
+        assert resp.status_code == 200
+        assert "astroml_llm_provider_health" in resp.text
diff --git a/astroml/db/schema.py b/astroml/db/schema.py
index f4da902..f5d1deb 100644
--- a/astroml/db/schema.py
+++ b/astroml/db/schema.py
@@ -643,7 +643,7 @@ class ProcessedLedger(Base):
     __tablename__ = "processed_ledgers"
 
     id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
-ledger_sequence: Mapped[int] = mapped_column(Integer, unique=True, nullable=False)
+    ledger_sequence: Mapped[int] = mapped_column(Integer, unique=True, nullable=False)
     source: Mapped[str] = mapped_column(
         String(256),
         nullable=False,
@@ -653,14 +653,10 @@ class ProcessedLedger(Base):
         nullable=False,
         server_default=func.now(),
     )
-status: Mapped[
-    Literal["pending", "processing", "completed", "failed"]
-] = mapped_column(
-    String(16),
-    nullable=False,
-    server_default="pending",
-)
-        String(32),
+    status: Mapped[
+        Literal["pending", "processing", "completed", "failed"]
+    ] = mapped_column(
+        String(16),
         nullable=False,
         server_default="pending",
     )
@@ -679,4 +675,3 @@ class ProcessedLedger(Base):
         Index("ix_processed_ledgers_status", "status"),
         Index("ix_processed_ledgers_source", "source"),
     )
-    )
diff --git a/astroml/llm/explainer.py b/astroml/llm/explainer.py
index 5bc37a9..950a9f5 100644
--- a/astroml/llm/explainer.py
+++ b/astroml/llm/explainer.py
@@ -37,11 +37,12 @@ def generate_explanation(self, alert_id: int, account_id: str, pattern: str, sco
                 latency_ms=latency_ms
             )
             
-            # Cache the response
             self.cache.set(prompt, response)
             
             return response
         except Exception as e:
+            provider_name = self.provider.__class__.__name__.replace("Provider", "").lower()
+            global_tracker.record_error(provider_name)
             return f"Error generating explanation: {str(e)}"
 
     def _build_prompt(self, account_id: str, pattern: str, score: float, transactions: List[Dict[str, Any]]) -> str:
diff --git a/astroml/llm/health.py b/astroml/llm/health.py
new file mode 100644
index 0000000..e98c250
--- /dev/null
+++ b/astroml/llm/health.py
@@ -0,0 +1,113 @@
+"""LLM Provider health checks."""
+import asyncio
+import os
+import time
+from typing import Any, Dict
+
+import aiohttp
+
+PROVIDER_ENDPOINTS = {
+    "openai": {
+        "url": "https://api.openai.com/v1/models",
+        "method": "GET",
+        "headers": lambda key: {"Authorization": f"Bearer {key}"},
+    },
+    "anthropic": {
+        "url": "https://api.anthropic.com/v1/models",
+        "method": "GET",  # /v1/messages only accepts POST, so probe models
+        "headers": lambda key: {
+            "x-api-key": key,
+            "anthropic-version": "2023-06-01",
+        },
+    },
+    "huggingface": {
+        "url": "https://api-inference.huggingface.co/status",
+        "method": "GET",
+        "headers": lambda key: {"Authorization": f"Bearer {key}"},
+    },
+}
+
+
+def _get_api_key(provider_name: str) -> str:
+    """Read ``<PROVIDER>_API_KEY`` from the environment ("" when unset)."""
+    return os.environ.get(f"{provider_name.upper()}_API_KEY", "")
+
+
+async def check_provider_health(
+    provider_name: str, timeout: float = 5.0
+) -> Dict[str, Any]:
+    """Probe one provider and mirror the outcome to the health gauge.
+
+    Returns ``provider``, ``status`` and ``latency_ms`` plus ``http_status``
+    or ``error``; only supported providers update the Prometheus gauge.
+    """
+    from astroml.llm.metrics import LLM_PROVIDER_HEALTH  # local: no cycle
+    start = time.perf_counter()
+    known = provider_name in PROVIDER_ENDPOINTS
+
+    def _result(status: str, **extra: Any) -> Dict[str, Any]:
+        latency_ms = (time.perf_counter() - start) * 1000
+        if known:  # path-supplied names must not create new gauge series
+            is_up = 1 if status == "healthy" else 0
+            LLM_PROVIDER_HEALTH.labels(provider=provider_name).set(is_up)
+        return {
+            "provider": provider_name,
+            "status": status,
+            "latency_ms": round(latency_ms, 2),
+            **extra,
+        }
+
+    if not known:
+        return _result(
+            "unknown", error="Provider not supported for health checks"
+        )
+    api_key = _get_api_key(provider_name)
+    if not api_key:
+        return _result("unhealthy", error="API key not configured")
+
+    config = PROVIDER_ENDPOINTS[provider_name]
+    try:
+        async with aiohttp.ClientSession() as session:
+            async with session.request(
+                method=config["method"],
+                url=config["url"],
+                headers=config["headers"](api_key),
+                timeout=aiohttp.ClientTimeout(total=timeout),
+            ) as response:
+                healthy = 200 <= response.status < 300
+                return _result(
+                    "healthy" if healthy else "unhealthy",
+                    http_status=response.status,
+                )
+    except Exception as e:
+        # Timeouts often carry no message; fall back to the exception type.
+        return _result("unhealthy", error=str(e) or type(e).__name__)
+
+
+async def check_all_providers() -> Dict[str, Any]:
+    """Probe all providers concurrently and summarise their health."""
+    providers = list(PROVIDER_ENDPOINTS)
+    results = await asyncio.gather(
+        *(check_provider_health(p) for p in providers),
+        return_exceptions=True,
+    )
+    provider_statuses: Dict[str, Any] = {}
+    # gather() keeps input order, so zip() names the provider that failed.
+    for name, result in zip(providers, results):
+        if isinstance(result, BaseException):
+            result = {
+                "provider": name,
+                "status": "unhealthy",
+                "latency_ms": 0,
+                "error": str(result) or type(result).__name__,
+            }
+        provider_statuses[name] = result
+
+    all_healthy = all(
+        r.get("status") == "healthy" for r in provider_statuses.values()
+    )
+    return {
+        "overall_status": "healthy" if all_healthy else "degraded",
+        "providers": provider_statuses,
+        "checked_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
+    }
diff --git a/astroml/llm/metrics.py b/astroml/llm/metrics.py
new file mode 100644
index 0000000..2413d24
--- /dev/null
+++ b/astroml/llm/metrics.py
@@ -0,0 +1,32 @@
+from prometheus_client import Counter, Gauge, Histogram
+
+LLM_REQUESTS_TOTAL = Counter(
+    "astroml_llm_requests_total",
+    "Total LLM API requests",
+    ["provider", "status"],
+)
+
+LLM_REQUEST_LATENCY_SECONDS = Histogram(
+    "astroml_llm_request_latency_seconds",
+    "LLM API request latency in seconds",
+    ["provider"],
+    buckets=[0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60],
+)
+
+LLM_COST_USD_TOTAL = Counter(
+    "astroml_llm_cost_usd_total",
+    "Total LLM API cost in USD",
+    ["provider"],
+)
+
+LLM_TOKENS_TOTAL = Counter(
+    "astroml_llm_tokens_total",
+    "Total LLM tokens processed",
+    ["provider", "token_type"],
+)
+
+LLM_PROVIDER_HEALTH = Gauge(
+    "astroml_llm_provider_health",
+    "LLM provider health status (1=healthy, 0=unhealthy)",
+    ["provider"],
+)
diff --git a/astroml/llm/tracker.py b/astroml/llm/tracker.py
index aba3ca8..85672bd 100644
--- a/astroml/llm/tracker.py
+++ b/astroml/llm/tracker.py
@@ -1,19 +1,24 @@
 """LLM Token Usage and Cost Tracking."""
 import logging
-from typing import Dict, Optional
+from typing import Dict
+
+from astroml.llm.metrics import (
+    LLM_COST_USD_TOTAL,
+    LLM_REQUEST_LATENCY_SECONDS,
+    LLM_REQUESTS_TOTAL,
+    LLM_TOKENS_TOTAL,
+)
 
 logger = logging.getLogger(__name__)
 
-# Mock cost per 1k tokens for different providers
 COST_RATES = {
     "openai": {"prompt": 0.03, "completion": 0.06},
     "anthropic": {"prompt": 0.015, "completion": 0.075},
     "huggingface": {"prompt": 0.001, "completion": 0.001},
 }
 
-class LLMUsageTracker:
-    """Tracks LLM API usage, costs, and latency."""
 
+class LLMUsageTracker:
     def __init__(self):
         self.total_cost = 0.0
         self.total_prompt_tokens = 0
@@ -23,42 +28,66 @@ def __init__(self):
     def record_usage(
         self, provider_name: str, usage: Dict[str, int], latency_ms: float
     ) -> float:
-        """
-        Record usage for a request and calculate cost.
-        Logs an alert if total cost exceeds the threshold.
-        """
-        rates = COST_RATES.get(provider_name.lower(), {"prompt": 0.0, "completion": 0.0})
-        
+        rates = COST_RATES.get(
+            provider_name.lower(), {"prompt": 0.0, "completion": 0.0}
+        )
         prompt_tokens = usage.get("prompt_tokens", 0)
         completion_tokens = usage.get("completion_tokens", 0)
-        
-        cost = (prompt_tokens / 1000.0) * rates["prompt"] + (completion_tokens / 1000.0) * rates["completion"]
-        
+        cost = (prompt_tokens / 1000.0) * rates["prompt"] + (
+            completion_tokens / 1000.0
+        ) * rates["completion"]
+
         self.total_prompt_tokens += prompt_tokens
         self.total_completion_tokens += completion_tokens
         self.total_cost += cost
-        
+
+        LLM_REQUESTS_TOTAL.labels(
+            provider=provider_name, status="success"
+        ).inc()
+        LLM_REQUEST_LATENCY_SECONDS.labels(provider=provider_name).observe(
+            latency_ms / 1000.0
+        )
+        LLM_COST_USD_TOTAL.labels(provider=provider_name).inc(cost)
+        LLM_TOKENS_TOTAL.labels(
+            provider=provider_name, token_type="prompt"
+        ).inc(prompt_tokens)
+        LLM_TOKENS_TOTAL.labels(
+            provider=provider_name, token_type="completion"
+        ).inc(completion_tokens)
+
         logger.info(
-            "LLM Usage Recorded: Provider=%s, PromptTokens=%d, CompletionTokens=%d, Cost=$%.4f, Latency=%.2fms",
-            provider_name, prompt_tokens, completion_tokens, cost, latency_ms
+            "LLM Usage Recorded: Provider=%s, PromptTokens=%d, "
+            "CompletionTokens=%d, Cost=$%.4f, Latency=%.2fms",
+            provider_name,
+            prompt_tokens,
+            completion_tokens,
+            cost,
+            latency_ms,
         )
-        
+
         self.check_alerts()
         return cost
 
+    def record_error(self, provider_name: str) -> None:
+        LLM_REQUESTS_TOTAL.labels(provider=provider_name, status="error").inc()
+
     def check_alerts(self):
-        """Check if cost alerts should be triggered."""
         if self.total_cost > self.alert_threshold:
-            logger.warning("LLM Cost Alert! Total cost ($%.2f) has exceeded threshold ($%.2f)", self.total_cost, self.alert_threshold)
+            logger.warning(
+                "LLM Cost Alert! Total cost ($%.2f) has exceeded "
+                "threshold ($%.2f)",
+                self.total_cost,
+                self.alert_threshold,
+            )
 
     def get_summary(self) -> Dict[str, float]:
-        """Get summary of tracking metrics."""
         return {
             "total_cost": self.total_cost,
             "total_prompt_tokens": self.total_prompt_tokens,
             "total_completion_tokens": self.total_completion_tokens,
-            "total_tokens": self.total_prompt_tokens + self.total_completion_tokens
+            "total_tokens": self.total_prompt_tokens
+            + self.total_completion_tokens,
         }
 
-# Global singleton tracker
+
 global_tracker = LLMUsageTracker()
diff --git a/docs/runbooks/llm_health.md b/docs/runbooks/llm_health.md
new file mode 100644
index 0000000..68a4680
--- /dev/null
+++ b/docs/runbooks/llm_health.md
@@ -0,0 +1,88 @@
+# LLM Infrastructure Runbook
+
+## Overview
+
+This runbook covers health checks, monitoring, alerting, and incident response for LLM providers (OpenAI, Anthropic, HuggingFace).
+
+## Health Check Architecture
+
+- **Health endpoints**: `GET /api/v1/llm/health` and `GET /api/v1/llm/health/{provider}`
+- **Polling interval**: 60 seconds via Prometheus or external monitor
+- **Metrics endpoint**: `GET /metrics` (Prometheus text format)
+- **Grafana dashboard**: `monitoring/grafana/llm_health_dashboard.json`
+
+## Key Metrics
+
+| Metric | Type | Description |
+|--------|------|-------------|
+| `astroml_llm_provider_health` | Gauge | 1 = healthy, 0 = unhealthy |
+| `astroml_llm_request_latency_seconds` | Histogram | Per-provider latency |
+| `astroml_llm_requests_total` | Counter | Request count by provider and status |
+| `astroml_llm_cost_usd_total` | Counter | Accumulated cost USD |
+| `astroml_llm_tokens_total` | Counter | Token count by provider and token_type |
+
+## Alerts
+
+| Alert | Condition | Severity |
+|-------|-----------|----------|
+| `LLMProviderDown` | Provider health == 0 for > 2m | Critical |
+| `LLMHighErrorRate` | Error rate > 0.1 req/s for > 2m | Warning |
+| `LLMCostThreshold` | Cost > $10 in 1h window | Warning |
+| `LLMHighLatency` | P95 latency > 5s for > 3m | Warning |
+
+## Cost Tracking
+
+- **Threshold**: $100 (logged)
+- **Granularity**: Per-request cost calculated using mock rates in `astroml/llm/tracker.py`
+- **Alerting**: Prometheus `LLMCostThreshold` rule triggers on spikes (>$10/hour)
+- **Dashboard**: Cost panel in Grafana shows 1-hour rolling sums
+
+## Incident Response
+
+### Provider Down
+1. Check `LLMProviderDown` alert in Alertmanager
+2. Verify API keys are configured (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `HUGGINGFACE_API_KEY`)
+3. Check network connectivity from container to provider API
+4. Review provider status pages: OpenAI, Anthropic, HuggingFace
+5. Rotate API keys if suspected exposure
+6. Failover: update `LLM_PROVIDER` env var to alternate provider
+
+### High Error Rate
+1. Check `LLMHighErrorRate` alert
+2. Correlate with latency spikes in Grafana dashboard
+3. Review application logs for stack traces
+4. Check for rate limits or quota exhaustion
+5. Consider switching providers or reducing request rate
+
+### Cost Spike
+1. Check `LLMCostThreshold` alert
+2. Correlate with traffic volume in Grafana
+3. Review recent deployments for prompt regression
+4. If legitimate growth, update budget thresholds
+5. If anomaly, audit prompt caching (`SemanticCache`) and consider tightening limits
+
+## Runbook Verification
+
+```bash
+# Verify health endpoint
+curl -s http://localhost:8000/api/v1/llm/health | jq
+
+# Verify metrics exposition
+curl -s http://localhost:8000/metrics | grep astroml_llm_
+
+# Run monitoring stack
+docker compose --profile monitoring up -d
+
+# Check Prometheus targets
+open http://localhost:9090/targets
+
+# Open Grafana
+open http://localhost:3000
+# Default login: admin/admin
+```
+
+## Maintenance
+
+- **Dashboard refresh**: Import `monitoring/grafana/llm_health_dashboard.json` into Grafana
+- **Alert review**: Review rules in `monitoring/prometheus/alert_rules.yml`
+- **Rate updates**: Update the mock `COST_RATES` in `astroml/llm/tracker.py` from the providers' pricing pages
diff --git a/monitoring/grafana/llm_health_dashboard.json b/monitoring/grafana/llm_health_dashboard.json
new file mode 100644
index 0000000..124fa24
--- /dev/null
+++ b/monitoring/grafana/llm_health_dashboard.json
@@ -0,0 +1,114 @@
+{
+  "dashboard": {
+    "title": "LLM Health Monitoring",
+    "uid": "llm-health",
+    "timezone": "browser",
+    "schemaVersion": 38,
+    "version": 0,
+    "refresh": "1m",
+    "time": {"from": "now-6h", "to": "now"},
+    "panels": [
+      {
+        "id": 1,
+        "title": "Provider Health Status",
+        "type": "stat",
+        "gridPos": {"h": 8, "w": 24, "x": 0, "y": 0},
+        "targets": [
+          {
+            "expr": "astroml_llm_provider_health",
+            "legendFormat": "{{provider}}",
+            "refId": "A"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "mappings": [
+              {"type": "value", "value": "0", "text": "Unhealthy"},
+              {"type": "value", "value": "1", "text": "Healthy"}
+            ],
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {"color": "red", "value": 0},
+                {"color": "green", "value": 1}
+              ]
+            }
+          }
+        }
+      },
+      {
+        "id": 2,
+        "title": "P95 Request Latency (seconds)",
+        "type": "graph",
+        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
+        "targets": [
+          {
+            "expr": "histogram_quantile(0.95, rate(astroml_llm_request_latency_seconds_bucket[5m]))",
+            "legendFormat": "{{provider}}",
+            "refId": "A"
+          }
+        ],
+        "yaxes": [
+          {"format": "s", "label": "Latency"},
+          {"format": "short", "show": false}
+        ]
+      },
+      {
+        "id": 3,
+        "title": "Request Error Rate (per sec)",
+        "type": "graph",
+        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
+        "targets": [
+          {
+            "expr": "rate(astroml_llm_requests_total{status=\"error\"}[5m])",
+            "legendFormat": "{{provider}}",
+            "refId": "A"
+          }
+        ]
+      },
+      {
+        "id": 4,
+        "title": "Cost (USD, last 1h)",
+        "type": "graph",
+        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16},
+        "targets": [
+          {
+            "expr": "sum by (provider) (increase(astroml_llm_cost_usd_total[1h]))",
+            "legendFormat": "{{provider}}",
+            "refId": "A"
+          }
+        ],
+        "yaxes": [
+          {"format": "currencyUSD", "label": "Cost"},
+          {"format": "short", "show": false}
+        ]
+      },
+      {
+        "id": 5,
+        "title": "Total Tokens (last 1h)",
+        "type": "graph",
+        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16},
+        "targets": [
+          {
+            "expr": "sum by (provider, token_type) (increase(astroml_llm_tokens_total[1h]))",
+            "legendFormat": "{{provider}} - {{token_type}}",
+            "refId": "A"
+          }
+        ]
+      },
+      {
+        "id": 6,
+        "title": "Total Requests",
+        "type": "graph",
+        "gridPos": {"h": 8, "w": 24, "x": 0, "y": 24},
+        "targets": [
+          {
+            "expr": "sum by (provider, status) (astroml_llm_requests_total)",
+            "legendFormat": "{{provider}} ({{status}})",
+            "refId": "A"
+          }
+        ]
+      }
+    ]
+  }
+}
diff --git a/monitoring/prometheus/alert_rules.yml b/monitoring/prometheus/alert_rules.yml
index b61b75e..84264bc 100644
--- a/monitoring/prometheus/alert_rules.yml
+++ b/monitoring/prometheus/alert_rules.yml
@@ -28,11 +28,49 @@ groups:
           summary: "Ingestion stalled for {{ $labels.stream_type }} on {{ $labels.horizon_url }}"
           description: "No records have been processed for {{ $labels.stream_type }} in the last 15 minutes."
 
-      - alert: PersistentRateLimit
-        expr: astroml_ingestion_rate_limit_backoff_seconds > 60
-        for: 10m
+      - alert: PersistentRateLimit
+        expr: astroml_ingestion_rate_limit_backoff_seconds > 60
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Persistent rate limiting for {{ $labels.stream_type }}"
+          description: "The ingestion service is facing persistent rate limiting with backoff > 60s for over 10 minutes."
+
+  - name: astroml_llm_alerts
+    rules:
+      - alert: LLMProviderDown
+        expr: astroml_llm_provider_health == 0
+        for: 2m
+        labels:
+          severity: critical
+        annotations:
+          summary: "LLM provider {{ $labels.provider }} is down"
+          description: "The {{ $labels.provider }} LLM provider has been unreachable for more than 2 minutes."
+
+      - alert: LLMHighErrorRate
+        expr: rate(astroml_llm_requests_total{status="error"}[5m]) > 0.1
+        for: 2m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High LLM error rate for {{ $labels.provider }}"
+          description: "LLM error rate for {{ $labels.provider }} is currently {{ $value }} requests/sec."
+
+      - alert: LLMCostThreshold
+        expr: increase(astroml_llm_cost_usd_total[1h]) > 10
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "LLM cost spike for {{ $labels.provider }}"
+          description: "LLM cost for {{ $labels.provider }} exceeded $10 in the last hour."
+
+      - alert: LLMHighLatency
+        expr: histogram_quantile(0.95, rate(astroml_llm_request_latency_seconds_bucket[5m])) > 5
+        for: 3m
         labels:
           severity: warning
         annotations:
-          summary: "Persistent rate limiting for {{ $labels.stream_type }}"
-          description: "The ingestion service is facing persistent rate limiting with backoff > 60s for over 10 minutes."
+          summary: "High LLM latency for {{ $labels.provider }}"
+          description: "P95 latency for {{ $labels.provider }} is {{ $value }}s (threshold 5s)."
diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml
index 6b73ec3..0807380 100644
--- a/monitoring/prometheus/prometheus.yml
+++ b/monitoring/prometheus/prometheus.yml
@@ -90,3 +90,13 @@ scrape_configs:
       - source_labels: [__address__]
         target_label: instance
         replacement: 'production'
+
+  # FastAPI API service metrics
+  - job_name: 'astroml-api'
+    metrics_path: '/metrics'
+    static_configs:
+      - targets: ['api:8000']
+    relabel_configs:
+      - source_labels: [__address__]
+        target_label: instance
+        replacement: 'api'