Traqora · Sadeequ · Jun 28, 2026 · Jun 29, 2026 · Jun 29, 2026
diff --git a/.github/workflows/llm-cicd.yml b/.github/workflows/llm-cicd.yml
@@ -0,0 +1,197 @@
+name: LLM CI/CD Pipeline  # display name of this workflow
+
+on:  # triggers: pushes and pull requests against main/develop that touch the listed paths
+  push:
+    branches: [main, develop]
+    paths:
+      - 'api/**'
+      - 'astroml/llm/**'
+      - 'api/tests/test_llm*'  # also covered by api/**; listed explicitly
+  pull_request:
+    branches: [main, develop]
+    paths:
+      - 'api/**'
+      - 'astroml/llm/**'
+      - 'api/tests/test_llm*'  # also covered by api/**; listed explicitly
+
+env:  # workflow-level variables, available to every job below
+  REGISTRY: ghcr.io
+  IMAGE_NAME: ${{ github.repository }}
+  LLM_COST_THRESHOLD: "0.50"  # quoted so the value stays the literal string 0.50 instead of a YAML float
+  LLM_LATENCY_BUDGET_MS: "2000"  # quoted for the same reason: environment values are strings
+  CANARY_NAMESPACE: astroml  # namespace passed to the kubectl commands in the canary job
+
+jobs:
+  llm-test:  # runs the three LLM pytest modules and publishes their JUnit reports
+    name: LLM Tests + Cost Awareness
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"  # quoted so the version is never read as a float
+          cache: pip
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install pytest pytest-asyncio httpx fastapi sqlalchemy prometheus-client
+
+      - name: Run LLM unit tests
+        run: pytest api/tests/test_llm.py -v --tb=short --junitxml=test-results/llm.xml
+
+      - name: Run LLM health tests
+        run: pytest api/tests/test_llm_health.py -v --tb=short --junitxml=test-results/llm-health.xml
+
+      - name: Run cost-aware tests
+        run: pytest api/tests/test_llm_cost_aware.py -v --tb=short --junitxml=test-results/llm-cost-aware.xml
+
+      - name: Upload test results
+        uses: actions/upload-artifact@v4
+        if: always()  # publish whatever reports exist even when a test step failed
+        with:
+          name: llm-test-results
+          path: |  # JUnit XML written by the pytest steps above; no step in this job generates coverage.xml
+            test-results/
+            coverage.xml
+
+  build-llm-image:  # builds the production API image and, on branch pushes, publishes it to the registry
+    name: Build LLM API Image
+    runs-on: ubuntu-latest
+    needs: llm-test  # only build once the LLM test job has succeeded
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Log in to Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}  # NOTE(review): pushing to ghcr.io needs packages:write on this token - confirm
+
+      - name: Extract metadata
+        id: meta  # only steps.meta.outputs.labels is consumed by the build step below
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+          tags: |
+            type=ref,event=branch
+            type=sha,prefix=
+
+      - name: Build LLM production image
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: api/Dockerfile  # the action's input is `file`; the previous `dockerfile` key is not a recognised input
+          target: production
+          push: ${{ github.event_name != 'pull_request' }}  # PR runs build only, so they cannot overwrite llm-latest
+          tags: |
+            ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:llm-${{ github.sha }}
+            ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:llm-latest
+          labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
+          build-args: |
+            LLM_PROVIDER=openai
+
+  canary-deploy:  # deploys the freshly built image as a canary, validates it, then promotes or rolls back
+    name: Canary Deploy + Validate
+    runs-on: ubuntu-latest
+    needs: build-llm-image
+    if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/develop'  # branch refs only; other refs skip the deploy
+    environment:
+      name: canary
+      url: https://canary.astroml.example.com
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up kubectl
+        uses: azure/setup-kubectl@v3
+        with:
+          version: 'v1.28.0'
+
+      - name: Configure kubectl
+        run: |
+          echo "${{ secrets.KUBE_CONFIG_CANARY }}" | base64 -d > kubeconfig
+          chmod 600 kubeconfig  # credentials file: owner-only; each later step exports KUBECONFIG itself
+
+      - name: Deploy canary
+        run: |
+          export KUBECONFIG=kubeconfig
+          export IMAGE_TAG="llm-${{ github.sha }}"
+          export REGISTRY="${{ env.REGISTRY }}"
+          export REPO="${{ github.repository }}"
+          ./scripts/canary-deploy.sh
+
+      - name: Wait for canary rollout
+        run: |
+          export KUBECONFIG=kubeconfig
+          kubectl rollout status deployment/astroml-api-canary -n ${{ env.CANARY_NAMESPACE }} --timeout=300s
+
+      - name: Health check canary  # port-forwards the first canary pod and probes both health endpoints
+        run: |
+          export KUBECONFIG=kubeconfig
+          CANARY_POD=$(kubectl get pods -n ${{ env.CANARY_NAMESPACE }} -l app=astroml-api,version=canary -o jsonpath='{.items[0].metadata.name}')
+          kubectl port-forward -n ${{ env.CANARY_NAMESPACE }} pod/$CANARY_POD 9000:8000 &
+          PF_PID=$!
+          sleep 5
+          curl -f http://localhost:9000/health || (kill $PF_PID && exit 1)
+          curl -f http://localhost:9000/api/v1/llm/health || (kill $PF_PID && exit 1)
+          kill $PF_PID
+
+      - name: Validate cost metrics  # NOTE(review): limit is hard-coded to 100.0; env LLM_COST_THRESHOLD is not used here - confirm
+        run: |  # NOTE(review): kubectl exec starts a new python process; confirm global_tracker reflects the serving process
+          export KUBECONFIG=kubeconfig
+          CANARY_POD=$(kubectl get pods -n ${{ env.CANARY_NAMESPACE }} -l app=astroml-api,version=canary -o jsonpath='{.items[0].metadata.name}')
+          kubectl exec -n ${{ env.CANARY_NAMESPACE }} pod/$CANARY_POD -- python -c "
+          from astroml.llm.tracker import global_tracker
+          from astroml.llm.metrics import LLM_COST_USD_TOTAL
+          assert global_tracker.total_cost < 100.0, 'Session cost exceeded threshold'
+          print('Cost check passed: $%.4f' % global_tracker.total_cost)
+          print('Metrics registered: LLM cost counter active')
+          "
+
+      - name: Promote canary
+        if: success()  # runs only when every earlier step in this job passed
+        run: |
+          export KUBECONFIG=kubeconfig
+          ./scripts/canary-promote.sh
+
+      - name: Auto rollback on failure
+        if: failure()  # runs when any earlier step in this job failed
+        run: |
+          export KUBECONFIG=kubeconfig
+          ./scripts/auto-rollback.sh
+
+      - name: Cleanup canary on failure
+        if: failure()  # also runs if the rollback step itself failed
+        run: |
+          export KUBECONFIG=kubeconfig
+          kubectl delete deployment astroml-api-canary -n ${{ env.CANARY_NAMESPACE }} --ignore-not-found=true
+
+  notify:  # posts the aggregated result of the upstream jobs to Slack
+    name: Notify Status
+    runs-on: ubuntu-latest
+    needs: [llm-test, build-llm-image, canary-deploy]  # every job whose result is reported
+    if: always()  # report even when an upstream job failed, was cancelled or was skipped
+    steps:
+      - name: Slack notification
+        uses: 8398a7/action-slack@v3
+        with:
+          status: ${{ contains(needs.*.result, 'failure') && 'failure' || contains(needs.*.result, 'cancelled') && 'cancelled' || 'success' }}  # job.status would only describe this notify job
+          text: |
+            LLM CI/CD Pipeline Status: ${{ contains(needs.*.result, 'failure') && 'failure' || contains(needs.*.result, 'cancelled') && 'cancelled' || 'success' }}
+            Branch: ${{ github.ref }}
+            Commit: ${{ github.sha }}
+            Author: ${{ github.actor }}
+        env:
+          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
diff --git a/Makefile b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: help quickstart test test-api lint format clean install run-api
+.PHONY: help quickstart test test-api lint format clean install run-api canary-deploy canary-promote rollback-llm
 
 help:
 	@echo "AstroML Development Commands"
@@ -13,6 +13,9 @@ help:
 	@echo "make install             Install development dependencies"
 	@echo "make clean               Clean build artifacts and cache"
 	@echo "make run-api             Start the FastAPI dev server on localhost:8000"
+	@echo "make canary-deploy       Deploy LLM canary to Kubernetes"
+	@echo "make canary-promote      Promote canary to stable"
+	@echo "make rollback-llm        Rollback LLM canary deployment"
 	@echo ""
 
 quickstart:
@@ -64,3 +67,24 @@ dev-setup:
 	@./scripts/seed_data.sh
 	@./scripts/health_check.sh
 	@echo "✅ Development environment ready."
+
+.PHONY: canary-deploy  # the variables and the script share one shell because the recipe is a single continued line
+canary-deploy:  # REGISTRY is read from the workflow env block; IMAGE_TAG uses the full SHA, matching the tag CI pushes
+	@echo "🚀 Deploying LLM canary..."
+	REGISTRY="$$(grep -E '^[[:space:]]*REGISTRY:' .github/workflows/llm-cicd.yml | head -n1 | sed 's/.*: //' | tr -d '"')" \
+	IMAGE_TAG="llm-$$(git rev-parse HEAD)" \
+	REPO="$$(basename "$$(pwd)")" \
+	NAMESPACE=astroml ./scripts/canary-deploy.sh
+
+.PHONY: canary-promote  # the variables and the script share one shell because the recipe is a single continued line
+canary-promote:  # REGISTRY is read from the workflow env block; IMAGE_TAG uses the full SHA, matching the tag CI pushes
+	@echo "✅ Promoting canary to stable..."
+	REGISTRY="$$(grep -E '^[[:space:]]*REGISTRY:' .github/workflows/llm-cicd.yml | head -n1 | sed 's/.*: //' | tr -d '"')" \
+	IMAGE_TAG="llm-$$(git rev-parse HEAD)" \
+	REPO="$$(basename "$$(pwd)")" \
+	NAMESPACE=astroml ./scripts/canary-promote.sh
+
+.PHONY: rollback-llm  # not a file target: always run the recipe
+rollback-llm:  # runs scripts/auto-rollback.sh with NAMESPACE and STABLE_DEPLOYMENT set for that one command
+	@echo "🔄 Rolling back LLM deployment..."
+	NAMESPACE=astroml STABLE_DEPLOYMENT=astroml-api ./scripts/auto-rollback.sh
diff --git a/api/app.py b/api/app.py
@@ -23,8 +23,9 @@
 from contextlib import asynccontextmanager
 from typing import AsyncGenerator
 
-from fastapi import FastAPI, Request
+from fastapi import FastAPI, Request, Response
 from fastapi.middleware.cors import CORSMiddleware
+from prometheus_client import CONTENT_TYPE_LATEST, generate_latest
 
 from api.auth.middleware import AuthMiddleware
 from api.audit_middleware import AuditLoggingMiddleware
@@ -45,6 +46,7 @@
     feedback_router,
     fraud_router,
     loyalty_router,
+    llm_health_router,
     mentorship_router,
     models_router,
     monitoring_router,
@@ -61,6 +63,7 @@
 )
 from api.routers.monitoring import record_latency
 from api.routers.ws import poll_and_broadcast_transactions
+from astroml.llm import metrics as _llm_metrics
 
 # Setup distributed tracing (issue #336)
 _tracer_provider = setup_tracing()
@@ -168,6 +171,7 @@ async def _latency_middleware(request: Request, call_next):
 app.include_router(ws_router)
 app.include_router(streaming_router)
 app.include_router(llm_router)
+app.include_router(llm_health_router)
 app.include_router(reports_router)
 app.include_router(alerts_router)
 
@@ -177,6 +181,11 @@ async def health():
     return {"status": "ok"}
 
 
+@app.get("/metrics", tags=["ops"])
+async def prometheus_metrics():
+    return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST)
+
+
 @app.get("/api/v1", tags=["ops"])
 async def api_root():
     return {"version": settings.api_version, "status": "ok"}
diff --git a/api/routers/__init__.py b/api/routers/__init__.py
@@ -22,6 +22,7 @@
 from api.routers.ws import router as ws_router
 from api.routers.streaming import router as streaming_router
 from api.routers.llm import router as llm_router
+from api.routers.llm_health import router as llm_health_router
 from api.routers.reports import router as reports_router
 from api.routers.alerts import router as alerts_router
 
@@ -49,6 +50,7 @@
     "ws_router",
     "streaming_router",
     "llm_router",
+    "llm_health_router",
     "reports_router",
     "alerts_router",
 ]