Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
197 changes: 197 additions & 0 deletions .github/workflows/llm-cicd.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
# LLM CI/CD pipeline: runs the LLM test suites, builds the API image, canary
# deploys it to Kubernetes and finally reports the outcome.
name: LLM CI/CD Pipeline

on:
  push:
    branches: [main, develop]
    paths:
      - "api/**"
      - "astroml/llm/**"
      - "api/tests/test_llm*"
      # Re-run the pipeline when the pipeline definition itself changes.
      - ".github/workflows/llm-cicd.yml"
  pull_request:
    branches: [main, develop]
    paths:
      - "api/**"
      - "astroml/llm/**"
      - "api/tests/test_llm*"
      - ".github/workflows/llm-cicd.yml"

# Workflow-level variables; they are exported to every step of every job.
env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}
  # Quoted so the values reach the steps verbatim as strings ("0.50" rather
  # than a re-serialized float).
  LLM_COST_THRESHOLD: "0.50"
  LLM_LATENCY_BUDGET_MS: "2000"
  CANARY_NAMESPACE: astroml

jobs:
# Runs the three LLM test suites and publishes their JUnit reports.
llm-test:
  name: LLM Tests + Cost Awareness
  runs-on: ubuntu-latest
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Set up Python
      uses: actions/setup-python@v5
      with:
        python-version: "3.11"
        cache: pip

    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install -r requirements.txt
        pip install pytest pytest-asyncio httpx fastapi sqlalchemy prometheus-client

    # Each suite writes a JUnit XML report so the upload step below has real
    # files to publish.
    - name: Run LLM unit tests
      run: pytest api/tests/test_llm.py -v --tb=short --junitxml=test-results/llm-unit.xml

    - name: Run LLM health tests
      run: pytest api/tests/test_llm_health.py -v --tb=short --junitxml=test-results/llm-health.xml

    - name: Run cost-aware tests
      run: pytest api/tests/test_llm_cost_aware.py -v --tb=short --junitxml=test-results/llm-cost-aware.xml

    # Previously uploaded `.pytest_cache/` (a hidden directory, which recent
    # upload-artifact v4 releases skip by default) and `coverage.xml` (never
    # generated: no coverage flags are passed to pytest), so the artifact was
    # effectively empty.
    - name: Upload test results
      uses: actions/upload-artifact@v4
      if: always()
      with:
        name: llm-test-results
        path: test-results/
        if-no-files-found: warn

# Builds the production image from api/Dockerfile and, on push events only,
# publishes it to the container registry.
build-llm-image:
  name: Build LLM API Image
  runs-on: ubuntu-latest
  needs: llm-test
  # GITHUB_TOKEN needs explicit package write access to push to GHCR when the
  # repository default token permission is read-only.
  permissions:
    contents: read
    packages: write
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Set up Docker Buildx
      uses: docker/setup-buildx-action@v3

    - name: Log in to Container Registry
      uses: docker/login-action@v3
      with:
        registry: ${{ env.REGISTRY }}
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}

    # Only the `labels` output of this step is consumed below; the tag rules
    # are kept as they were.
    - name: Extract metadata
      id: meta
      uses: docker/metadata-action@v5
      with:
        images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
        tags: |
          type=ref,event=branch
          type=sha,prefix=

    - name: Build LLM production image
      uses: docker/build-push-action@v5
      with:
        context: .
        # The action's input is `file`; the former `dockerfile:` key is not a
        # recognised input, so the build fell back to ./Dockerfile.
        file: api/Dockerfile
        target: production
        # Pull requests still build (to validate the Dockerfile) but must not
        # publish, otherwise an unmerged PR overwrites `llm-latest`.
        push: ${{ github.event_name == 'push' }}
        # NOTE(review): registries require lowercase image names; if the
        # repository owner/name contains capitals these tags need lowercasing.
        tags: |
          ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:llm-${{ github.sha }}
          ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:llm-latest
        labels: ${{ steps.meta.outputs.labels }}
        cache-from: type=gha
        cache-to: type=gha,mode=max
        build-args: |
          LLM_PROVIDER=openai

# Deploys the freshly built image as a canary, validates it, then either
# promotes it or rolls back and removes the canary deployment.
canary-deploy:
  name: Canary Deploy + Validate
  runs-on: ubuntu-latest
  needs: build-llm-image
  if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/develop'
  environment:
    name: canary
    url: https://canary.astroml.example.com
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    - name: Set up kubectl
      uses: azure/setup-kubectl@v3
      with:
        version: "v1.28.0"

    # The secret is passed through the environment instead of being
    # interpolated into the script text. The kubeconfig is written outside
    # the checkout with owner-only permissions, and KUBECONFIG is exported
    # once (absolute path) for every later step via GITHUB_ENV.
    - name: Configure kubectl
      env:
        KUBE_CONFIG_B64: ${{ secrets.KUBE_CONFIG_CANARY }}
      run: |
        umask 077
        echo "$KUBE_CONFIG_B64" | base64 -d > "$RUNNER_TEMP/kubeconfig"
        echo "KUBECONFIG=$RUNNER_TEMP/kubeconfig" >> "$GITHUB_ENV"

    - name: Deploy canary
      env:
        IMAGE_TAG: llm-${{ github.sha }}
        REGISTRY: ${{ env.REGISTRY }}
        REPO: ${{ github.repository }}
      run: ./scripts/canary-deploy.sh

    - name: Wait for canary rollout
      run: |
        kubectl rollout status deployment/astroml-api-canary -n ${{ env.CANARY_NAMESPACE }} --timeout=300s

    - name: Health check canary
      run: |
        CANARY_POD=$(kubectl get pods -n ${{ env.CANARY_NAMESPACE }} -l app=astroml-api,version=canary -o jsonpath='{.items[0].metadata.name}')
        kubectl port-forward -n ${{ env.CANARY_NAMESPACE }} "pod/$CANARY_POD" 9000:8000 &
        PF_PID=$!
        # Always stop the port-forward, whichever command ends the step.
        trap 'kill "$PF_PID" 2>/dev/null || true' EXIT
        # Retry while the tunnel comes up instead of relying on a fixed sleep.
        curl -f --retry 10 --retry-delay 1 --retry-connrefused http://localhost:9000/health
        curl -f http://localhost:9000/api/v1/llm/health

    # NOTE(review): this starts a new Python process inside the pod, so it
    # presumably sees a fresh tracker rather than the serving process's
    # totals - confirm the check is meaningful. The 100.0 limit is also
    # independent of LLM_COST_THRESHOLD; confirm which value is intended.
    - name: Validate cost metrics
      run: |
        CANARY_POD=$(kubectl get pods -n ${{ env.CANARY_NAMESPACE }} -l app=astroml-api,version=canary -o jsonpath='{.items[0].metadata.name}')
        kubectl exec -n ${{ env.CANARY_NAMESPACE }} "pod/$CANARY_POD" -- python -c "
        from astroml.llm.tracker import global_tracker
        from astroml.llm.metrics import LLM_COST_USD_TOTAL
        assert global_tracker.total_cost < 100.0, 'Session cost exceeded threshold'
        print('Cost check passed: $%.4f' % global_tracker.total_cost)
        print('Metrics registered: LLM cost counter active')
        "

    - name: Promote canary
      if: success()
      run: ./scripts/canary-promote.sh

    - name: Auto rollback on failure
      if: failure()
      run: ./scripts/auto-rollback.sh

    - name: Cleanup canary on failure
      if: failure()
      run: |
        kubectl delete deployment astroml-api-canary -n ${{ env.CANARY_NAMESPACE }} --ignore-not-found=true

# Posts the overall pipeline outcome to Slack once the upstream jobs finish.
notify:
  name: Notify Status
  runs-on: ubuntu-latest
  # Every upstream job is listed so a failed image build is reported too.
  needs: [llm-test, build-llm-image, canary-deploy]
  if: always()
  env:
    # `job.status` describes this notify job (always "success" by the time
    # the step runs), so the outcome is derived from the upstream results.
    # Skipped jobs (e.g. canary-deploy on pull requests) count as success.
    PIPELINE_STATUS: ${{ contains(needs.*.result, 'failure') && 'failure' || contains(needs.*.result, 'cancelled') && 'cancelled' || 'success' }}
  steps:
    - name: Slack notification
      uses: 8398a7/action-slack@v3
      with:
        status: ${{ env.PIPELINE_STATUS }}
        text: |
          LLM CI/CD Pipeline Status: ${{ env.PIPELINE_STATUS }}
          Branch: ${{ github.ref }}
          Commit: ${{ github.sha }}
          Author: ${{ github.actor }}
      env:
        SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
26 changes: 25 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
.PHONY: help quickstart test test-api lint format clean install run-api
.PHONY: help quickstart test test-api lint format clean install run-api canary-deploy canary-promote rollback-llm

help:
@echo "AstroML Development Commands"
Expand All @@ -13,6 +13,9 @@ help:
@echo "make install Install development dependencies"
@echo "make clean Clean build artifacts and cache"
@echo "make run-api Start the FastAPI dev server on localhost:8000"
@echo "make canary-deploy Deploy LLM canary to Kubernetes"
@echo "make canary-promote Promote canary to stable"
@echo "make rollback-llm Rollback LLM canary deployment"
@echo ""

quickstart:
Expand Down Expand Up @@ -64,3 +67,24 @@ dev-setup:
@./scripts/seed_data.sh
@./scripts/health_check.sh
@echo "✅ Development environment ready."

# Deploy the LLM canary using the same registry and image tag scheme as CI.
#
# The variable assignments and the script call form ONE logical recipe line
# (backslash continuations): make runs every recipe line in its own shell, so
# assignments placed on separate lines never reached the script.
#
# - REGISTRY is read from the workflow's `env:` block; the key is indented in
#   the YAML, hence the leading-whitespace pattern.
# - IMAGE_TAG uses the full commit SHA to match the `llm-<sha>` tag CI pushes.
# - NOTE(review): CI passes REPO as "owner/name" (github.repository) while
#   this uses only the checkout directory name - confirm what the script
#   expects.
.PHONY: canary-deploy
canary-deploy:
	@echo "🚀 Deploying LLM canary..."
	REGISTRY=$$(grep -E '^[[:space:]]*REGISTRY:' .github/workflows/llm-cicd.yml | head -n1 | sed 's/.*: //' | tr -d '"') \
	IMAGE_TAG=llm-$$(git rev-parse HEAD) \
	REPO=$$(basename "$$(pwd)") \
	NAMESPACE=astroml ./scripts/canary-deploy.sh

# Promote the LLM canary to stable using the same registry and image tag
# scheme as CI.
#
# The variable assignments and the script call form ONE logical recipe line
# (backslash continuations): make runs every recipe line in its own shell, so
# assignments placed on separate lines never reached the script.
#
# - REGISTRY is read from the workflow's `env:` block; the key is indented in
#   the YAML, hence the leading-whitespace pattern.
# - IMAGE_TAG uses the full commit SHA to match the `llm-<sha>` tag CI pushes.
# - NOTE(review): CI passes REPO as "owner/name" (github.repository) while
#   this uses only the checkout directory name - confirm what the script
#   expects.
.PHONY: canary-promote
canary-promote:
	@echo "✅ Promoting canary to stable..."
	REGISTRY=$$(grep -E '^[[:space:]]*REGISTRY:' .github/workflows/llm-cicd.yml | head -n1 | sed 's/.*: //' | tr -d '"') \
	IMAGE_TAG=llm-$$(git rev-parse HEAD) \
	REPO=$$(basename "$$(pwd)") \
	NAMESPACE=astroml ./scripts/canary-promote.sh

# Roll back the LLM deployment by running scripts/auto-rollback.sh.
# NAMESPACE and STABLE_DEPLOYMENT are set on the same command line as the
# script, so they are passed to it as environment variables.
.PHONY: rollback-llm
rollback-llm:
	@echo "🔄 Rolling back LLM deployment..."
	NAMESPACE=astroml STABLE_DEPLOYMENT=astroml-api ./scripts/auto-rollback.sh
11 changes: 10 additions & 1 deletion api/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@
from contextlib import asynccontextmanager
from typing import AsyncGenerator

from fastapi import FastAPI, Request
from fastapi import FastAPI, Request, Response
from fastapi.middleware.cors import CORSMiddleware
from prometheus_client import CONTENT_TYPE_LATEST, generate_latest

from api.auth.middleware import AuthMiddleware
from api.audit_middleware import AuditLoggingMiddleware
Expand All @@ -45,6 +46,7 @@
feedback_router,
fraud_router,
loyalty_router,
llm_health_router,
mentorship_router,
models_router,
monitoring_router,
Expand All @@ -61,6 +63,7 @@
)
from api.routers.monitoring import record_latency
from api.routers.ws import poll_and_broadcast_transactions
from astroml.llm import metrics as _llm_metrics

# Setup distributed tracing (issue #336)
_tracer_provider = setup_tracing()
Expand Down Expand Up @@ -168,6 +171,7 @@ async def _latency_middleware(request: Request, call_next):
app.include_router(ws_router)
app.include_router(streaming_router)
app.include_router(llm_router)
app.include_router(llm_health_router)
app.include_router(reports_router)
app.include_router(alerts_router)

Expand All @@ -177,6 +181,11 @@ async def health():
return {"status": "ok"}


# Metrics scrape endpoint: serializes the metrics known to prometheus_client
# via generate_latest() and returns them with the matching content type.
# Written as a comment rather than a docstring so the generated API
# description is unchanged.
@app.get("/metrics", tags=["ops"])
async def prometheus_metrics():
    return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST)


# API root endpoint: reports the configured API version (settings.api_version)
# together with a static "ok" status.
@app.get("/api/v1", tags=["ops"])
async def api_root():
    return {"version": settings.api_version, "status": "ok"}
2 changes: 2 additions & 0 deletions api/routers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from api.routers.ws import router as ws_router
from api.routers.streaming import router as streaming_router
from api.routers.llm import router as llm_router
from api.routers.llm_health import router as llm_health_router
from api.routers.reports import router as reports_router
from api.routers.alerts import router as alerts_router

Expand Down Expand Up @@ -49,6 +50,7 @@
"ws_router",
"streaming_router",
"llm_router",
"llm_health_router",
"reports_router",
"alerts_router",
]
Loading