diff --git a/.github/workflows/llm-cicd.yml b/.github/workflows/llm-cicd.yml new file mode 100644 index 0000000..6160639 --- /dev/null +++ b/.github/workflows/llm-cicd.yml @@ -0,0 +1,197 @@ +name: LLM CI/CD Pipeline + +on: + push: + branches: [ main, develop ] + paths: + - "api/**" + - "astroml/llm/**" + - "api/tests/test_llm*" + pull_request: + branches: [ main, develop ] + paths: + - "api/**" + - "astroml/llm/**" + - "api/tests/test_llm*" + +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} + LLM_COST_THRESHOLD: 0.50 + LLM_LATENCY_BUDGET_MS: 2000 + CANARY_NAMESPACE: astroml + +jobs: + llm-test: + name: LLM Tests + Cost Awareness + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: pip + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + pip install pytest pytest-asyncio httpx fastapi sqlalchemy prometheus-client + + - name: Run LLM unit tests + run: pytest api/tests/test_llm.py -v --tb=short + + - name: Run LLM health tests + run: pytest api/tests/test_llm_health.py -v --tb=short + + - name: Run cost-aware tests + run: pytest api/tests/test_llm_cost_aware.py -v --tb=short + + - name: Upload test results + uses: actions/upload-artifact@v4 + if: always() + with: + name: llm-test-results + path: | + .pytest_cache/ + coverage.xml + + build-llm-image: + name: Build LLM API Image + runs-on: ubuntu-latest + needs: llm-test + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Log in to Container Registry + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=ref,event=branch + type=sha,prefix= + + - name: Build LLM production image + uses: docker/build-push-action@v5 + with: + context: . + dockerfile: api/Dockerfile + target: production + push: true + tags: | + ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:llm-${{ github.sha }} + ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:llm-latest + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max + build-args: | + LLM_PROVIDER=openai + + canary-deploy: + name: Canary Deploy + Validate + runs-on: ubuntu-latest + needs: build-llm-image + if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/develop' + environment: + name: canary + url: https://canary.astroml.example.com + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up kubectl + uses: azure/setup-kubectl@v3 + with: + version: 'v1.28.0' + + - name: Configure kubectl + run: | + echo "${{ secrets.KUBE_CONFIG_CANARY }}" | base64 -d > kubeconfig + export KUBECONFIG=kubeconfig + + - name: Deploy canary + run: | + export KUBECONFIG=kubeconfig + export IMAGE_TAG="llm-${{ github.sha }}" + export REGISTRY="${{ env.REGISTRY }}" + export REPO="${{ github.repository }}" + ./scripts/canary-deploy.sh + + - name: Wait for canary rollout + run: | + export KUBECONFIG=kubeconfig + kubectl rollout status deployment/astroml-api-canary -n ${{ env.CANARY_NAMESPACE }} --timeout=300s + + - name: Health check canary + run: | + export KUBECONFIG=kubeconfig + CANARY_POD=$(kubectl get pods -n ${{ env.CANARY_NAMESPACE }} -l app=astroml-api,version=canary -o jsonpath='{.items[0].metadata.name}') + kubectl port-forward -n ${{ env.CANARY_NAMESPACE }} pod/$CANARY_POD 9000:8000 & + PF_PID=$! + sleep 5 + curl -f http://localhost:9000/health || (kill $PF_PID && exit 1) + curl -f http://localhost:9000/api/v1/llm/health || (kill $PF_PID && exit 1) + kill $PF_PID + + - name: Validate cost metrics + run: | + export KUBECONFIG=kubeconfig + CANARY_POD=$(kubectl get pods -n ${{ env.CANARY_NAMESPACE }} -l app=astroml-api,version=canary -o jsonpath='{.items[0].metadata.name}') + kubectl exec -n ${{ env.CANARY_NAMESPACE }} pod/$CANARY_POD -- python -c " + from astroml.llm.tracker import global_tracker + from astroml.llm.metrics import LLM_COST_USD_TOTAL + assert global_tracker.total_cost < 100.0, 'Session cost exceeded threshold' + print('Cost check passed: $%.4f' % global_tracker.total_cost) + print('Metrics registered: LLM cost counter active') + " + + - name: Promote canary + if: success() + run: | + export KUBECONFIG=kubeconfig + ./scripts/canary-promote.sh + + - name: Auto rollback on failure + if: failure() + run: | + export KUBECONFIG=kubeconfig + ./scripts/auto-rollback.sh + + - name: Cleanup canary on failure + if: failure() + run: | + export KUBECONFIG=kubeconfig + kubectl delete deployment astroml-api-canary -n ${{ env.CANARY_NAMESPACE }} --ignore-not-found=true + + notify: + name: Notify Status + runs-on: ubuntu-latest + needs: [llm-test, canary-deploy] + if: always() + steps: + - name: Slack notification + uses: 8398a7/action-slack@v3 + with: + status: ${{ job.status }} + text: | + LLM CI/CD Pipeline Status: ${{ job.status }} + Branch: ${{ github.ref }} + Commit: ${{ github.sha }} + Author: ${{ github.actor }} + env: + SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} diff --git a/Makefile b/Makefile index 909d4c6..919deb4 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: help quickstart test test-api lint format clean install run-api +.PHONY: help quickstart test test-api lint format clean install run-api canary-deploy canary-promote rollback-llm help: @echo "AstroML Development Commands" @@ -13,6 +13,9 @@ help: @echo "make install Install development dependencies" @echo "make clean Clean build artifacts and cache" @echo "make run-api Start the FastAPI dev server on localhost:8000" + @echo "make canary-deploy Deploy LLM canary to Kubernetes" + @echo "make canary-promote Promote canary to stable" + @echo "make rollback-llm Rollback LLM canary deployment" @echo "" quickstart: @@ -64,3 +67,24 @@ dev-setup: @./scripts/seed_data.sh @./scripts/health_check.sh @echo "โœ… Development environment ready." + +.PHONY: canary-deploy +canary-deploy: + @echo "๐Ÿš€ Deploying LLM canary..." + REGISTRY=$(shell grep -E '^REGISTRY' .github/workflows/llm-cicd.yml | head -n1 | sed 's/.*: //' | tr -d '"') + IMAGE_TAG=llm-$(shell git rev-parse --short HEAD) + REPO=$(shell basename $$(pwd)) + NAMESPACE=astroml ./scripts/canary-deploy.sh + +.PHONY: canary-promote +canary-promote: + @echo "โœ… Promoting canary to stable..." + REGISTRY=$(shell grep -E '^REGISTRY' .github/workflows/llm-cicd.yml | head -n1 | sed 's/.*: //' | tr -d '"') + IMAGE_TAG=llm-$(shell git rev-parse --short HEAD) + REPO=$(shell basename $$(pwd)) + NAMESPACE=astroml ./scripts/canary-promote.sh + +.PHONY: rollback-llm +rollback-llm: + @echo "๐Ÿ”„ Rolling back LLM deployment..." + NAMESPACE=astroml STABLE_DEPLOYMENT=astroml-api ./scripts/auto-rollback.sh diff --git a/api/app.py b/api/app.py index dd91dc1..f848d7a 100644 --- a/api/app.py +++ b/api/app.py @@ -23,8 +23,9 @@ from contextlib import asynccontextmanager from typing import AsyncGenerator -from fastapi import FastAPI, Request +from fastapi import FastAPI, Request, Response from fastapi.middleware.cors import CORSMiddleware +from prometheus_client import CONTENT_TYPE_LATEST, generate_latest from api.auth.middleware import AuthMiddleware from api.audit_middleware import AuditLoggingMiddleware @@ -47,6 +48,7 @@ feedback_router, fraud_router, loyalty_router, + llm_health_router, mentorship_router, models_router, monitoring_router, @@ -64,6 +66,7 @@ ) from api.routers.monitoring import record_latency from api.routers.ws import poll_and_broadcast_transactions +from astroml.llm import metrics as _llm_metrics # Setup distributed tracing (issue #336) _tracer_provider = setup_tracing() @@ -174,6 +177,7 @@ async def _latency_middleware(request: Request, call_next): app.include_router(streaming_router) app.include_router(voice_router) app.include_router(llm_router) +app.include_router(llm_health_router) app.include_router(reports_router) app.include_router(alerts_router) @@ -183,6 +187,11 @@ async def health(): return {"status": "ok"} +@app.get("/metrics", tags=["ops"]) +async def prometheus_metrics(): + return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST) + + @app.get("/api/v1", tags=["ops"]) async def api_root(): return {"version": settings.api_version, "status": "ok"} diff --git a/api/routers/__init__.py b/api/routers/__init__.py index 426bf17..2527f31 100644 --- a/api/routers/__init__.py +++ b/api/routers/__init__.py @@ -24,6 +24,7 @@ from api.routers.ws import router as ws_router from api.routers.streaming import router as streaming_router from api.routers.llm import router as llm_router +from api.routers.llm_health import router as llm_health_router from api.routers.reports import router as reports_router from api.routers.alerts import router as alerts_router @@ -54,6 +55,7 @@ "ws_router", "streaming_router", "llm_router", + "llm_health_router", "reports_router", "alerts_router", ] diff --git a/api/routers/alerts.py b/api/routers/alerts.py index 67156ff..e33a692 100644 --- a/api/routers/alerts.py +++ b/api/routers/alerts.py @@ -1,19 +1,33 @@ """Alerts API router (issue XXX).""" from __future__ import annotations -from typing import Optional -from fastapi import APIRouter, Depends, Query +from datetime import datetime, timedelta +from typing import List, Optional + +from fastapi import APIRouter, Depends, HTTPException, Query, status from sqlalchemy import select from sqlalchemy.orm import Session from api.database import get_sync_db -from api.models.orm import FraudAlert +from api.models.orm import ApiTransaction, FraudAlert from api.schemas import ( - PrioritizedAlertsResponse, + FraudAlertOut, + FraudAlertsResponse, + FraudExplanationOut, PrioritizedAlertOut, + PrioritizedAlertsResponse, TransactionSummaryOut, + # Predictive Alerts schemas + BehavioralBaseline, + BehavioralBaselineResponse, + DeviationAlert, + PredictiveAlertRequest, + PredictiveAlertResponse, + AlertGenerationRequest, + AlertGenerationResponse, ) from api.services.alert_prioritization import alert_prioritizer +from api.services.predictive_alerts import predictive_alert_service router = APIRouter(prefix="/api/v1/alerts", tags=["alerts"]) @@ -71,3 +85,106 @@ def get_prioritized_alerts( total_processed=total_processed, total_remaining=len(data), ) + + +@router.get("/predictive", response_model=PredictiveAlertResponse) +def get_predictive_alerts( + account_id: str, + lookback_days: int = Query(30, ge=1, le=365), + metrics: Optional[List[str]] = Query(None), + sensitivity: str = Query("medium", pattern="^(low|medium|high)$"), + db: Session = Depends(get_sync_db), +): + """ + Generate predictive alerts for account behavior changes. + + Analyzes historical transaction data to establish behavioral baselines + and detects significant deviations that may indicate unusual activity. + """ + # Validate account exists (basic check) + if not account_id or len(account_id) < 10: # Stellar addresses are typically longer + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="Invalid account ID format" + ) + + # Use the predictive alert service + result = predictive_alert_service.generate_predictive_alerts( + account_id=account_id, + lookback_days=lookback_days, + metrics=metrics, + sensitivity=sensitivity + ) + + # Check if service returned an error + if "error" in result: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=result["error"] + ) + + # Check for informational messages (no alerts generated) + if "message" in result and "alerts" not in result: + # Return empty alerts list with metadata + return PredictiveAlertResponse( + alerts=[], + baselines_used=[], + generated_at=datetime.utcnow(), + total_analyzed=0 + ) + + # Convert to response model + return PredictiveAlertResponse( + alerts=[DeviationAlert(**alert) for alert in result.get("alerts", [])], + baselines_used=[ + BehavioralBaseline(**baseline) + for baseline in result.get("baselines_used", []) + ], + generated_at=datetime.fromisoformat(result["generated_at"]) if isinstance(result.get("generated_at"), str) else result.get("generated_at", datetime.utcnow()), + total_analyzed=result.get("total_analyzed", 0) + ) + + +@router.post("/generate-explanations", response_model=AlertGenerationResponse) +def generate_alert_explanations( + request: AlertGenerationRequest, +): + """ + Generate natural language explanations for detected anomalies. + """ + try: + # Use the alert generator from the predictive service + explanations = [] + for deviation in request.deviations: + # Convert Pydantic model back to dict for the generator + deviation_dict = deviation.dict() + explanation_result = predictive_alert_service.alert_generator.generate_explanation( + deviation.alert_id, + deviation.account_id, + { + "metric_name": deviation.metric_name, + "current_value": deviation.current_value, + "expected_value": sum(deviation.expected_range) / 2 if deviation.expected_range else 0, + "deviation_score": deviation.deviation_score, + "severity": deviation.severity + } + ) + explanations.append(explanation_result.get("explanation", "No explanation available")) + + return AlertGenerationResponse( + alerts=request.deviations, + explanations=explanations, + generated_at=datetime.utcnow() + ) + + except Exception as e: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Failed to generate explanations: {str(e)}" + ) + + +@router.get("/predictive/status") +def get_predictive_service_status(): + """Get status of the predictive alerts service.""" + return predictive_alert_service.get_service_status() \ No newline at end of file diff --git a/api/routers/llm.py b/api/routers/llm.py index 81a51a2..2a4cd03 100644 --- a/api/routers/llm.py +++ b/api/routers/llm.py @@ -1,9 +1,11 @@ +import hashlib import os import time +from typing import List, Dict, Any, AsyncGenerator, Optional, Union from fastapi import APIRouter, Depends, HTTPException, Request from fastapi.responses import StreamingResponse -from pydantic import BaseModel +from pydantic import BaseModel, Field from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession from api.services.llm_explainer import TransactionExplainer @@ -11,6 +13,7 @@ from api.services.llm_context import MultiModalContextHandler from api.services.llm_validation import ResponseValidator from api.services.llm_rag import build_citations, build_rag_answer, retrieve_sources +from api.services.translation import translation_service from astroml.llm.embedding_cache import EmbeddingCache from astroml.llm.embedding_drift import EmbeddingDriftMonitor from astroml.llm.memory import ConversationMemory @@ -25,6 +28,14 @@ LLMFeedbackOut, LLMFeedbackTrend, LLMPromptImprovement, + TranslationRequest, + TranslationResponse, + BatchTranslationRequest, + BatchTranslationResponse, + SupportedLanguagesResponse, + LocaleFormatRequest, + LocaleFormatResponse, + TranslationCacheStatsResponse, ) from api.auth.dependencies import get_current_auth, AuthContext from typing import List, Dict, Any, AsyncGenerator, Callable, TypeVar @@ -530,3 +541,153 @@ async def llm_prompt_improvements(db: AsyncSession = Depends(get_db)) -> list[LL ) for feature, items in sorted(by_feature.items()) ] + + +# โ”€โ”€โ”€ Translation endpoints (Issue 1) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +class TranslationRequest(BaseModel): + text: str = Field(..., min_length=1, max_length=50000) + target_language: str = Field(..., min_length=2, max_length=10) + source_language: Optional[str] = Field(default=None, min_length=2, max_length=10) + use_cache: bool = True + + +class TranslationResponse(BaseModel): + translated_text: str + source_language: str + target_language: str + cached: bool + latency_ms: float + + +class BatchTranslationRequest(BaseModel): + texts: List[str] = Field(..., min_length=1, max_length=100) + target_language: str = Field(..., min_length=2, max_length=10) + source_language: Optional[str] = Field(default=None, min_length=2, max_length=10) + use_cache: bool = True + + +class BatchTranslationResponse(BaseModel): + translations: List[TranslationResponse] + total_latency_ms: float + + +class SupportedLanguagesResponse(BaseModel): + languages: Dict[str, Dict[str, str]] + + +@router.get("/translate/languages", response_model=SupportedLanguagesResponse) +async def get_supported_languages(auth: AuthContext = Depends(get_current_auth)): + """Get list of supported languages for translation.""" + return SupportedLanguagesResponse(languages=translation_service.get_supported_languages()) + + +@router.post("/translate", response_model=TranslationResponse) +async def translate_text( + request: TranslationRequest, + auth: AuthContext = Depends(get_current_auth), +): + """Translate text to target language.""" + try: + result = await translation_service.translate( + text=request.text, + target_language=request.target_language, + source_language=request.source_language, + use_cache=request.use_cache, + ) + return TranslationResponse(**result) + except ValueError as e: + raise HTTPException(status_code=400, detail=str(e)) + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + +@router.post("/translate/batch", response_model=BatchTranslationResponse) +async def translate_batch( + request: BatchTranslationRequest, + auth: AuthContext = Depends(get_current_auth), +): + """Translate multiple texts to target language.""" + try: + results = await translation_service.translate_batch( + texts=request.texts, + target_language=request.target_language, + source_language=request.source_language, + use_cache=request.use_cache, + ) + total_latency = sum(r["latency_ms"] for r in results) + return BatchTranslationResponse( + translations=[TranslationResponse(**r) for r in results], + total_latency_ms=total_latency, + ) + except ValueError as e: + raise HTTPException(status_code=400, detail=str(e)) + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + +class LocaleFormatRequest(BaseModel): + value: Union[float, int, str] + locale: str = Field(..., min_length=2, max_length=10) + format_type: str = Field(..., pattern="^(number|currency|percent|date|datetime)$") + currency_code: Optional[str] = Field(default=None, min_length=3, max_length=3) + + +class LocaleFormatResponse(BaseModel): + formatted: str + locale: str + format_type: str + + +@router.post("/translate/format", response_model=LocaleFormatResponse) +async def format_locale( + request: LocaleFormatRequest, + auth: AuthContext = Depends(get_current_auth), +): + """Format numbers, currencies, dates, etc. for a specific locale.""" + try: + formatted = translation_service.format_locale( + value=request.value, + locale=request.locale, + format_type=request.format_type, + currency_code=request.currency_code, + ) + return LocaleFormatResponse( + formatted=formatted, + locale=request.locale, + format_type=request.format_type, + ) + except ValueError as e: + raise HTTPException(status_code=400, detail=str(e)) + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) + + +class TranslationCacheStatsResponse(BaseModel): + hits: int + misses: int + sets: int + invalidations: int + hit_rate: float + size: int + + +@router.get("/translate/cache/stats", response_model=TranslationCacheStatsResponse) +async def get_translation_cache_stats(auth: AuthContext = Depends(get_current_auth)): + """Get translation cache statistics.""" + stats = translation_service.get_cache_stats() + return TranslationCacheStatsResponse(**stats) + + +@router.post("/translate/cache/invalidate") +async def invalidate_translation_cache( + text: Optional[str] = None, + auth: AuthContext = Depends(get_current_auth), +): + """Invalidate translation cache (specific text or all).""" + if text: + translation_service.invalidate_cache(text) + return {"message": "Cache entry invalidated", "text_hash": hashlib.sha256(text.encode()).hexdigest()[:16]} + else: + count = translation_service.invalidate_all_cache() + return {"message": f"Invalidated {count} cache entries"} diff --git a/api/routers/llm_health.py b/api/routers/llm_health.py new file mode 100644 index 0000000..8d7e629 --- /dev/null +++ b/api/routers/llm_health.py @@ -0,0 +1,20 @@ +"""LLM Health and Provider Status API.""" +from __future__ import annotations + +from fastapi import APIRouter + +from astroml.llm.health import check_all_providers, check_provider_health + +router = APIRouter(prefix="/api/v1/llm", tags=["llm-health"]) + + +@router.get("/health") +async def llm_health(): + result = await check_all_providers() + return result + + +@router.get("/health/{provider_name}") +async def llm_provider_health(provider_name: str): + result = await check_provider_health(provider_name) + return result diff --git a/api/schemas.py b/api/schemas.py deleted file mode 100644 index 9d8b0ac..0000000 --- a/api/schemas.py +++ /dev/null @@ -1,769 +0,0 @@ -"""Pydantic schemas shared across all API routers.""" -from __future__ import annotations - -import re -from datetime import datetime -from typing import Any, Dict, List, Optional - -from pydantic import BaseModel, Field, field_validator - - -# โ”€โ”€โ”€ Fraud โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ - -class EdgeInput(BaseModel): - src: str - dst: str - amount: float = 0.0 - timestamp: float = 0.0 - asset: str = "XLM" - - -class ScoreRequest(BaseModel): - accounts: List[str] = Field(..., max_length=50) - edges: List[EdgeInput] = Field(default_factory=list) - - -class ScoreResponse(BaseModel): - scores: Dict[str, float] - - -class FraudAlertOut(BaseModel): - id: int - account_id: str - pattern: Optional[str] = None - risk_score: float - risk_level: str - description: Optional[str] = None - detected_at: datetime - - class Config: - from_attributes = True - - -class FraudAlertsResponse(BaseModel): - data: List[FraudAlertOut] - page: int - page_size: int - total: int - - -class FraudExplanationOut(BaseModel): - alert_id: int - explanation: str - generated_in_ms: float - cached: bool - - -class TransactionSummaryOut(BaseModel): - hash: str - amount: float - asset_code: str - destination_account: Optional[str] = None - created_at: str - - -class PrioritizedAlertOut(BaseModel): - id: int - account_id: str - pattern: Optional[str] = None - risk_score: float - risk_level: str - priority_score: float - priority_level: str - explanation: str - detected_at: datetime - recent_transactions: List[TransactionSummaryOut] - account_activity_score: float - is_duplicate: bool = False - duplicate_of: Optional[int] = None - - class Config: - from_attributes = True - - -class PrioritizedAlertsResponse(BaseModel): - data: List[PrioritizedAlertOut] - deduplication_reduction_pct: int - total_processed: int - total_remaining: int - - -class RiskPoint(BaseModel): - date: str - score: float - - -class FraudStatsResponse(BaseModel): - total_alerts: int - high_risk: int - medium_risk: int - low_risk: int - recent_alerts: List[FraudAlertOut] - risk_over_time: List[RiskPoint] - - -# โ”€โ”€โ”€ Accounts โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ - -class AccountOut(BaseModel): - account_id: str - balance: Optional[float] = None - sequence: Optional[int] = None - home_domain: Optional[str] = None - flags: int = 0 - last_modified_ledger: Optional[int] = None - created_at: Optional[datetime] = None - updated_at: Optional[datetime] = None - - class Config: - from_attributes = True - - -class AccountsResponse(BaseModel): - data: List[AccountOut] - page: int - page_size: int - total: int - - -class TransactionOut(BaseModel): - hash: str - ledger_sequence: int - source_account: str - created_at: datetime - fee: int - operation_count: int - successful: bool - memo_type: Optional[str] = None - memo: Optional[str] = None - - class Config: - from_attributes = True - - -class TransactionsResponse(BaseModel): - data: List[TransactionOut] - page: int - page_size: int - total: int - - -class FraudSummaryOut(BaseModel): - account_id: str - total_alerts: int - high_risk: int - medium_risk: int - low_risk: int - latest_score: Optional[float] = None - - -class LoyaltySummaryOut(BaseModel): - account_id: str - points_balance: int - tier_id: str - tier_name: str - - -# โ”€โ”€โ”€ Monitoring โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ - -class ModelMetricsOut(BaseModel): - accuracy: Optional[float] = None - precision: Optional[float] = None - recall: Optional[float] = None - f1: Optional[float] = None - f1_score: Optional[float] = None # alias populated from f1 for compatibility - auc: Optional[float] = None - auc_roc: Optional[float] = None # alias populated from auc for compatibility - drift_score: Optional[float] = None - recorded_at: Optional[datetime] = None - - # LLM Tracking - llm_cost: Optional[float] = None - llm_prompt_tokens: Optional[int] = None - llm_completion_tokens: Optional[int] = None - - -class PerformancePoint(BaseModel): - date: str - accuracy: Optional[float] = None - precision: Optional[float] = None - recall: Optional[float] = None - f1: Optional[float] = None - auc: Optional[float] = None - - -class DriftReport(BaseModel): - features: Dict[str, float] - overall_drift: float - generated_at: datetime - - -class PredictionStats(BaseModel): - total_predictions: int - anomaly_rate: float - avg_score: float - period_days: int - - -class LatencyStats(BaseModel): - p50_ms: float - p95_ms: float - p99_ms: float - - -# โ”€โ”€โ”€ Loyalty โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ - -class LoyaltyTierOut(BaseModel): - id: str - name: str - threshold: int - multiplier: float - color: str - - -class BenefitOut(BaseModel): - id: str - title: str - description: str - - -class NextTierInfo(BaseModel): - tier: LoyaltyTierOut - remaining_to_upgrade: int - progress_pct: int - - -class LoyaltySummaryFull(BaseModel): - current_tier: LoyaltyTierOut - points_balance: int - next_tier: Optional[NextTierInfo] = None - benefits: List[BenefitOut] - - -class PointsTransactionOut(BaseModel): - id: str - date: str - type: str # earn | redeem | adjust - points: int - source: Optional[str] = None - note: Optional[str] = None - - -class PointsHistoryResponse(BaseModel): - data: List[PointsTransactionOut] - page: int - page_size: int - total: int - - -class RedeemRequest(BaseModel): - points: int = Field(..., gt=0) - reward_id: Optional[str] = None - - -class RedeemResponse(BaseModel): - new_balance: int - transaction: PointsTransactionOut - - -class ReferralOut(BaseModel): - url: str - invited: int - rewards: int - - -# โ”€โ”€โ”€ Mentorship โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ - -class MentorProfileIn(BaseModel): - bio: Optional[str] = None - skills: List[str] = Field(default_factory=list) - years_experience: int = Field(ge=0) - preferred_session_day: Optional[str] = None - max_mentees: int = Field(default=3, ge=1, le=10) - - -class MentorProfileOut(BaseModel): - id: int - github_username: str - bio: Optional[str] = None - skills: List[str] - years_experience: int - preferred_session_day: Optional[str] = None - max_mentees: int - is_available: bool - created_at: datetime - - class Config: - from_attributes = True - - -class MenteeProfileIn(BaseModel): - bio: Optional[str] = None - learning_interests: List[str] = Field(default_factory=list) - years_experience: int = Field(ge=0) - preferred_session_day: Optional[str] = None - goals: Optional[str] = None - - -class MenteeProfileOut(BaseModel): - id: int - github_username: str - bio: Optional[str] = None - learning_interests: List[str] - years_experience: int - preferred_session_day: Optional[str] = None - goals: Optional[str] = None - created_at: datetime - - class Config: - from_attributes = True - - -class MentorMatchOut(BaseModel): - mentor_id: int - mentor_username: str - skill_overlap: float - experience_gap: float - availability_match: float - total_score: float - - -class MentorshipOut(BaseModel): - id: int - mentor_id: int - mentor_username: str - mentee_id: int - mentee_username: str - status: str - match_score: float - started_at: datetime - ended_at: Optional[datetime] = None - - class Config: - from_attributes = True - - -class MentorshipSessionIn(BaseModel): - duration_minutes: int = Field(gt=0, le=480) # max 8 hours - topic: str = Field(min_length=3, max_length=256) - notes: Optional[str] = None - - -class MentorshipSessionOut(BaseModel): - id: int - mentorship_id: int - session_date: datetime - duration_minutes: int - topic: str - notes: Optional[str] = None - - class Config: - from_attributes = True - - -class MentorshipFeedbackIn(BaseModel): - rating: int = Field(ge=1, le=5) - feedback_text: Optional[str] = None - - -class MentorshipFeedbackOut(BaseModel): - id: int - session_id: int - rating: int - feedback_text: Optional[str] = None - is_mentor_feedback: bool - created_at: datetime - - class Config: - from_attributes = True - - -class MentorshipMetrics(BaseModel): - total_sessions: int - total_hours: float - avg_rating: float - topics_covered: List[str] - last_session_date: Optional[datetime] = None - - -class MentorMetrics(BaseModel): - total_mentees: int - total_sessions: int - total_hours: float - avg_rating: float - - -class MentorshipListResponse(BaseModel): - data: List[MentorshipOut] - page: int - page_size: int - total: int - - -class MentorListResponse(BaseModel): - data: List[MentorProfileOut] - page: int - page_size: int - total: int - - -class MenteeListResponse(BaseModel): - data: List[MenteeProfileOut] - page: int - page_size: int - total: int - - -# โ”€โ”€โ”€ Notifications โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ - -class NotificationOut(BaseModel): - id: int - event_type: str - title: str - content: Optional[str] = None - link: Optional[str] = None - actor: Optional[str] = None - is_read: bool - created_at: datetime - - class Config: - from_attributes = True - - -class NotificationListResponse(BaseModel): - data: List[NotificationOut] - unread_count: int - - -class NotificationPreferenceIn(BaseModel): - email_enabled: bool = True - slack_enabled: bool = False - discord_enabled: bool = False - pr_comments: bool = True - pr_mentions: bool = True - issue_comments: bool = True - issue_mentions: bool = True - review_requests: bool = True - digest_frequency: str = "weekly" # daily|weekly|never - slack_webhook_url: Optional[str] = None - discord_webhook_url: Optional[str] = None - - -class NotificationPreferenceOut(BaseModel): - id: int - user_id: int - email_enabled: bool - slack_enabled: bool - discord_enabled: bool - pr_comments: bool - pr_mentions: bool - issue_comments: bool - issue_mentions: bool - review_requests: bool - digest_frequency: str - created_at: datetime - updated_at: datetime - - class Config: - from_attributes = True - - -class WebhookEventIn(BaseModel): - event_type: str # pr_comment|issue_comment|review_request|pr_merged - pr_number: Optional[int] = None - issue_number: Optional[int] = None - commenter: Optional[str] = None - content: Optional[str] = None - reviewer_id: Optional[int] = None - author_id: Optional[int] = None - repo: str - link: str - - -class DigestEmailOut(BaseModel): - user_id: int - period: str - notifications_count: int - generated_at: datetime - - -# โ”€โ”€โ”€ Onboarding โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ - -class OnboardingStepIn(BaseModel): - step: str - - -class OnboardingChecklistItem(BaseModel): - step: str - label: str - completed: bool - - -class OnboardingProgressOut(BaseModel): - github_username: str - checklist: List[OnboardingChecklistItem] - completed_count: int - total_steps: int - progress_pct: int - is_complete: bool - started_at: str - last_updated: str - - -# โ”€โ”€โ”€ FAQ (issue #307) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ - -class FAQOut(BaseModel): - id: int - category: str - question: str - answer: str - order: int - is_published: bool - created_at: datetime - updated_at: datetime - - class Config: - from_attributes = True - - -class FAQIn(BaseModel): - category: str = Field(..., min_length=1, max_length=64) - question: str = Field(..., min_length=1, max_length=512) - answer: str = Field(..., min_length=1) - order: int = Field(default=0, ge=0) - is_published: bool = True - - -class FAQUpdateIn(BaseModel): - category: Optional[str] = Field(None, min_length=1, max_length=64) - question: Optional[str] = Field(None, min_length=1, max_length=512) - answer: Optional[str] = Field(None, min_length=1) - order: Optional[int] = Field(None, ge=0) - is_published: Optional[bool] = None - - -class FAQListResponse(BaseModel): - data: List[FAQOut] - categories: List[str] - total: int - - -class FAQFeedbackIn(BaseModel): - is_helpful: bool - user_comment: Optional[str] = None - - -class FAQFeedbackOut(BaseModel): - id: int - faq_id: int - is_helpful: bool - user_comment: Optional[str] = None - created_at: datetime - - class Config: - from_attributes = True - - -class FAQSuggestionIn(BaseModel): - question: str = Field(..., min_length=1, max_length=512) - suggested_answer: Optional[str] = None - category: Optional[str] = Field(None, max_length=64) - - -class FAQSuggestionOut(BaseModel): - id: int - question: str - suggested_answer: Optional[str] = None - category: Optional[str] = None - status: str - created_at: datetime - - class Config: - from_attributes = True - - -class FAQSuggestionListResponse(BaseModel): - data: List[FAQSuggestionOut] - page: int - page_size: int - total: int - - -# โ”€โ”€โ”€ Contact / Support tickets (issue #305) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ - -_EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$") - - -class ContactFormIn(BaseModel): - name: str = Field(min_length=1, max_length=120) - email: str = Field(min_length=3, max_length=254) - subject: str = Field(min_length=1, max_length=200) - message: str = Field(min_length=1, max_length=5000) - # reCAPTCHA token from the frontend widget; optional when verification is off. - recaptcha_token: Optional[str] = None - - @field_validator("name", "subject", "message") - @classmethod - def _not_blank(cls, v: str) -> str: - if not v or not v.strip(): - raise ValueError("must not be blank") - return v.strip() - - @field_validator("email") - @classmethod - def _valid_email(cls, v: str) -> str: - v = v.strip() - if not _EMAIL_RE.match(v): - raise ValueError("invalid email address") - return v - - -class SupportTicketOut(BaseModel): - reference: str - status: str - created_at: datetime - - class Config: - from_attributes = True - - -# โ”€โ”€โ”€ Feedback (issue #308) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ - -FEEDBACK_CATEGORIES = {"bug", "feature", "general"} -FEEDBACK_STATUSES = {"open", "planned", "in_progress", "completed", "declined"} -ROADMAP_STATUSES = ("planned", "in_progress", "completed") - - -class FeedbackIn(BaseModel): - category: str = Field(min_length=1, max_length=16) - message: str = Field(min_length=1, max_length=5000) - email: Optional[str] = Field(default=None, max_length=254) - screenshot: Optional[str] = None # data URL: data:image/png;base64,... - - @field_validator("category") - @classmethod - def _valid_category(cls, v: str) -> str: - v = v.strip().lower() - if v not in FEEDBACK_CATEGORIES: - raise ValueError("category must be one of: bug, feature, general") - return v - - @field_validator("message") - @classmethod - def _not_blank(cls, v: str) -> str: - if not v.strip(): - raise ValueError("message must not be blank") - return v.strip() - - @field_validator("screenshot") - @classmethod - def _valid_screenshot(cls, v: Optional[str]) -> Optional[str]: - if v is None: - return v - if not v.startswith("data:image/"): - raise ValueError("screenshot must be an image data URL") - return v - - -class FeedbackOut(BaseModel): - id: int - category: str - message: str - status: str - github_issue_url: Optional[str] = None - created_at: datetime - - class Config: - from_attributes = True - - -class ContactSubmitResponse(BaseModel): - message: str - ticket: SupportTicketOut - - -class FeedbackListResponse(BaseModel): - data: List[FeedbackOut] - page: int - page_size: int - total: int - - -class FeedbackStatusUpdate(BaseModel): - status: str - - @field_validator("status") - @classmethod - def _valid_status(cls, v: str) -> str: - v = v.strip().lower() - if v not in FEEDBACK_STATUSES: - raise ValueError("invalid status") - return v - - -class RoadmapItem(BaseModel): - id: int - category: str - message: str - status: str - - class Config: - from_attributes = True - - -class RoadmapResponse(BaseModel): - planned: List[RoadmapItem] - in_progress: List[RoadmapItem] - completed: List[RoadmapItem] - -# โ”€โ”€โ”€ LLM feedback (#402) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ - -class LLMFeedbackIn(BaseModel): - feature: str = Field(min_length=1, max_length=64) - prompt: str = Field(min_length=1, max_length=8000) - output: str = Field(min_length=1, max_length=8000) - rating: int = Field(ge=1, le=5) - comment: Optional[str] = Field(default=None, max_length=2000) - user_id: Optional[str] = Field(default=None, max_length=128) - is_expert: bool = False - expert_weight: float = Field(default=1.0, ge=1.0, le=5.0) - - @field_validator("feature", "prompt", "output") - @classmethod - def _strip_required(cls, v: str) -> str: - v = v.strip() - if not v: - raise ValueError("must not be blank") - return v - - -class LLMFeedbackOut(BaseModel): - id: int - feature: str - rating: int - comment: Optional[str] = None - is_expert: bool - expert_weight: float - created_at: datetime - - class Config: - from_attributes = True - - -class LLMFeedbackTrend(BaseModel): - feature: str - count: int - average_rating: float - weighted_average_rating: float - expert_count: int - - -class LLMFeedbackDashboard(BaseModel): - total: int - trends: List[LLMFeedbackTrend] - low_rating_examples: List[LLMFeedbackOut] - - -class LLMPromptImprovement(BaseModel): - feature: str - recommendation: str - evidence_count: int diff --git a/api/schemas/__init__.py b/api/schemas/__init__.py index 236d9da..af7d84b 100644 --- a/api/schemas/__init__.py +++ b/api/schemas/__init__.py @@ -1,2 +1,877 @@ -"""Schemas package for API models.""" +"""Pydantic schemas shared across all API routers.""" +from __future__ import annotations +import re +from datetime import datetime +from typing import Any, Dict, List, Optional, Tuple, Literal + +from pydantic import BaseModel, Field, field_validator + + +# โ”€โ”€โ”€ Fraud โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +class EdgeInput(BaseModel): + src: str + dst: str + amount: float = 0.0 + timestamp: float = 0.0 + asset: str = "XLM" + + +class ScoreRequest(BaseModel): + accounts: List[str] = Field(..., max_length=50) + edges: List[EdgeInput] = Field(default_factory=list) + + +class ScoreResponse(BaseModel): + scores: Dict[str, float] + + +class FraudAlertOut(BaseModel): + id: int + account_id: str + pattern: Optional[str] = None + risk_score: float + risk_level: str + description: Optional[str] = None + detected_at: datetime + + class Config: + from_attributes = True + + +class FraudAlertsResponse(BaseModel): + data: List[FraudAlertOut] + page: int + page_size: int + total: int + + +class FraudExplanationOut(BaseModel): + alert_id: int + explanation: str + generated_in_ms: float + cached: bool + + +class TransactionSummaryOut(BaseModel): + hash: str + amount: float + asset_code: str + destination_account: Optional[str] = None + created_at: str + + +class PrioritizedAlertOut(BaseModel): + id: int + account_id: str + pattern: Optional[str] = None + risk_score: float + risk_level: str + priority_score: float + priority_level: str + explanation: str + detected_at: datetime + recent_transactions: List[TransactionSummaryOut] + account_activity_score: float + is_duplicate: bool = False + duplicate_of: Optional[int] = None + + class Config: + from_attributes = True + + +class PrioritizedAlertsResponse(BaseModel): + data: List[PrioritizedAlertOut] + deduplication_reduction_pct: int + total_processed: int + total_remaining: int + + +class RiskPoint(BaseModel): + date: str + score: float + + +class FraudStatsResponse(BaseModel): + total_alerts: int + high_risk: int + medium_risk: int + low_risk: int + recent_alerts: List[FraudAlertOut] + risk_over_time: List[RiskPoint] + + +# โ”€โ”€โ”€ Accounts โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +class AccountOut(BaseModel): + account_id: str + balance: Optional[float] = None + sequence: Optional[int] = None + home_domain: Optional[str] = None + flags: int = 0 + last_modified_ledger: Optional[int] = None + created_at: Optional[datetime] = None + updated_at: Optional[datetime] = None + + class Config: + from_attributes = True + + +class AccountsResponse(BaseModel): + data: List[AccountOut] + page: int + page_size: int + total: int + + +class TransactionOut(BaseModel): + hash: str + ledger_sequence: int + source_account: str + created_at: datetime + fee: int + operation_count: int + successful: bool + memo_type: Optional[str] = None + memo: Optional[str] = None + + class Config: + from_attributes = True + + +class TransactionsResponse(BaseModel): + data: List[TransactionOut] + page: int + page_size: int + total: int + + +class FraudSummaryOut(BaseModel): + account_id: str + total_alerts: int + high_risk: int + medium_risk: int + low_risk: int + latest_score: Optional[float] = None + + +class LoyaltySummaryOut(BaseModel): + account_id: str + points_balance: int + tier_id: str + tier_name: str + + +# โ”€โ”€โ”€ Monitoring โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +class ModelMetricsOut(BaseModel): + accuracy: Optional[float] = None + precision: Optional[float] = None + recall: Optional[float] = None + f1: Optional[float] = None + f1_score: Optional[float] = None # alias populated from f1 for compatibility + auc: Optional[float] = None + auc_roc: Optional[float] = None # alias populated from auc for compatibility + drift_score: Optional[float] = None + recorded_at: Optional[datetime] = None + + # LLM Tracking + llm_cost: Optional[float] = None + llm_prompt_tokens: Optional[int] = None + llm_completion_tokens: Optional[int] = None + + +class PerformancePoint(BaseModel): + date: str + accuracy: Optional[float] = None + precision: Optional[float] = None + recall: Optional[float] = None + f1: Optional[float] = None + auc: Optional[float] = None + + +class DriftReport(BaseModel): + features: Dict[str, float] + overall_drift: float + generated_at: datetime + + +class PredictionStats(BaseModel): + total_predictions: int + anomaly_rate: float + avg_score: float + period_days: int + + +class LatencyStats(BaseModel): + p50_ms: float + p95_ms: float + p99_ms: float + + +# โ”€โ”€โ”€ Loyalty โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +class LoyaltyTierOut(BaseModel): + id: str + name: str + threshold: int + multiplier: float + color: str + + +class BenefitOut(BaseModel): + id: str + title: str + description: str + + +class NextTierInfo(BaseModel): + tier: LoyaltyTierOut + remaining_to_upgrade: int + progress_pct: int + + +class LoyaltySummaryFull(BaseModel): + current_tier: LoyaltyTierOut + points_balance: int + next_tier: Optional[NextTierInfo] = None + benefits: List[BenefitOut] + + +class PointsTransactionOut(BaseModel): + id: str + date: str + type: str # earn | redeem | adjust + points: int + source: Optional[str] = None + note: Optional[str] = None + + +class PointsHistoryResponse(BaseModel): + data: List[PointsTransactionOut] + page: int + page_size: int + total: int + + +class RedeemRequest(BaseModel): + points: int = Field(..., gt=0) + reward_id: Optional[str] = None + + +class RedeemResponse(BaseModel): + new_balance: int + transaction: PointsTransactionOut + + +class ReferralOut(BaseModel): + url: str + invited: int + rewards: int + + +# โ”€โ”€โ”€ Mentorship โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +class MentorProfileIn(BaseModel): + bio: Optional[str] = None + skills: List[str] = Field(default_factory=list) + years_experience: int = Field(ge=0) + preferred_session_day: Optional[str] = None + max_mentees: int = Field(default=3, ge=1, le=10) + + +class MentorProfileOut(BaseModel): + id: int + github_username: str + bio: Optional[str] = None + skills: List[str] + years_experience: int + preferred_session_day: Optional[str] = None + max_mentees: int + is_available: bool + created_at: datetime + + class Config: + from_attributes = True + + +class MenteeProfileIn(BaseModel): + bio: Optional[str] = None + learning_interests: List[str] = Field(default_factory=list) + years_experience: int = Field(ge=0) + preferred_session_day: Optional[str] = None + goals: Optional[str] = None + + +class MenteeProfileOut(BaseModel): + id: int + github_username: str + bio: Optional[str] = None + learning_interests: List[str] + years_experience: int + preferred_session_day: Optional[str] = None + goals: Optional[str] = None + created_at: datetime + + class Config: + from_attributes = True + + +class MentorMatchOut(BaseModel): + mentor_id: int + mentor_username: str + skill_overlap: float + experience_gap: float + availability_match: float + total_score: float + + +class MentorshipOut(BaseModel): + id: int + mentor_id: int + mentor_username: str + mentee_id: int + mentee_username: str + status: str + match_score: float + started_at: datetime + ended_at: Optional[datetime] = None + + class Config: + from_attributes = True + + +class MentorshipSessionIn(BaseModel): + duration_minutes: int = Field(gt=0, le=480) # max 8 hours + topic: str = Field(min_length=3, max_length=256) + notes: Optional[str] = None + + +class MentorshipSessionOut(BaseModel): + id: int + mentorship_id: int + session_date: datetime + duration_minutes: int + topic: str + notes: Optional[str] = None + + class Config: + from_attributes = True + + +class MentorshipFeedbackIn(BaseModel): + rating: int = Field(ge=1, le=5) + feedback_text: Optional[str] = None + + +class MentorshipFeedbackOut(BaseModel): + id: int + session_id: int + rating: int + feedback_text: Optional[str] = None + is_mentor_feedback: bool + created_at: datetime + + class Config: + from_attributes = True + + +class MentorshipMetrics(BaseModel): + total_sessions: int + total_hours: float + avg_rating: float + topics_covered: List[str] + last_session_date: Optional[datetime] = None + + +class MentorMetrics(BaseModel): + total_mentees: int + total_sessions: int + total_hours: float + avg_rating: float + + +class MentorshipListResponse(BaseModel): + data: List[MentorshipOut] + page: int + page_size: int + total: int + + +class MentorListResponse(BaseModel): + data: List[MentorProfileOut] + page: int + page_size: int + total: int + + +class MenteeListResponse(BaseModel): + data: List[MenteeProfileOut] + page: int + page_size: int + total: int + + +# โ”€โ”€โ”€ Notifications โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +class NotificationOut(BaseModel): + id: int + event_type: str + title: str + content: Optional[str] = None + link: Optional[str] = None + actor: Optional[str] = None + is_read: bool + created_at: datetime + + class Config: + from_attributes = True + + +class NotificationListResponse(BaseModel): + data: List[NotificationOut] + unread_count: int + + +class NotificationPreferenceIn(BaseModel): + email_enabled: bool = True + slack_enabled: bool = False + discord_enabled: bool = False + pr_comments: bool = True + pr_mentions: bool = True + issue_comments: bool = True + issue_mentions: bool = True + review_requests: bool = True + digest_frequency: str = "weekly" # daily|weekly|never + slack_webhook_url: Optional[str] = None + discord_webhook_url: Optional[str] = None + + +class NotificationPreferenceOut(BaseModel): + id: int + user_id: int + email_enabled: bool + slack_enabled: bool + discord_enabled: bool + pr_comments: bool + pr_mentions: bool + issue_comments: bool + issue_mentions: bool + review_requests: bool + digest_frequency: str + created_at: datetime + updated_at: datetime + + class Config: + from_attributes = True + + +class WebhookEventIn(BaseModel): + event_type: str # pr_comment|issue_comment|review_request|pr_merged + pr_number: Optional[int] = None + issue_number: Optional[int] = None + commenter: Optional[str] = None + content: Optional[str] = None + reviewer_id: Optional[int] = None + author_id: Optional[int] = None + repo: str + link: str + + +class DigestEmailOut(BaseModel): + user_id: int + period: str + notifications_count: int + generated_at: datetime + + +# โ”€โ”€โ”€ Onboarding โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +class OnboardingStepIn(BaseModel): + step: str + + +class OnboardingChecklistItem(BaseModel): + step: str + label: str + completed: bool + + +class OnboardingProgressOut(BaseModel): + github_username: str + checklist: List[OnboardingChecklistItem] + completed_count: int + total_steps: int + progress_pct: int + is_complete: bool + started_at: str + last_updated: str + + +# โ”€โ”€โ”€ FAQ (issue #307) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +class FAQOut(BaseModel): + id: int + category: str + question: str + answer: str + order: int + is_published: bool + created_at: datetime + updated_at: datetime + + class Config: + from_attributes = True + + +class FAQIn(BaseModel): + category: str = Field(..., min_length=1, max_length=64) + question: str = Field(..., min_length=1, max_length=512) + answer: str = Field(..., min_length=1) + order: int = Field(default=0, ge=0) + is_published: bool = True + + +class FAQUpdateIn(BaseModel): + category: Optional[str] = Field(None, min_length=1, max_length=64) + question: Optional[str] = Field(None, min_length=1, max_length=512) + answer: Optional[str] = Field(None, min_length=1) + order: Optional[int] = Field(None, ge=0) + is_published: Optional[bool] = None + + +class FAQListResponse(BaseModel): + data: List[FAQOut] + categories: List[str] + total: int + + +class FAQFeedbackIn(BaseModel): + is_helpful: bool + user_comment: Optional[str] = None + + +class FAQFeedbackOut(BaseModel): + id: int + faq_id: int + is_helpful: bool + user_comment: Optional[str] = None + created_at: datetime + + class Config: + from_attributes = True + + +class FAQSuggestionIn(BaseModel): + question: str = Field(..., min_length=1, max_length=512) + suggested_answer: Optional[str] = None + category: Optional[str] = Field(None, max_length=64) + + +class FAQSuggestionOut(BaseModel): + id: int + question: str + suggested_answer: Optional[str] = None + category: Optional[str] = None + status: str + created_at: datetime + + class Config: + from_attributes = True + + +class FAQSuggestionListResponse(BaseModel): + data: List[FAQSuggestionOut] + page: int + page_size: int + total: int + + +# โ”€โ”€โ”€ Contact / Support tickets (issue #305) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +_EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$") + + +class ContactFormIn(BaseModel): + name: str = Field(min_length=1, max_length=120) + email: str = Field(min_length=3, max_length=254) + subject: str = Field(min_length=1, max_length=200) + message: str = Field(min_length=1, max_length=5000) + # reCAPTCHA token from the frontend widget; optional when verification is off. + recaptcha_token: Optional[str] = None + + @field_validator("name", "subject", "message") + @classmethod + def _not_blank(cls, v: str) -> str: + if not v or not v.strip(): + raise ValueError("must not be blank") + return v.strip() + + @field_validator("email") + @classmethod + def _valid_email(cls, v: str) -> str: + v = v.strip() + if not _EMAIL_RE.match(v): + raise ValueError("invalid email address") + return v + + +class SupportTicketOut(BaseModel): + reference: str + status: str + created_at: datetime + + class Config: + from_attributes = True + + +# โ”€โ”€โ”€ Feedback (issue #308) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +FEEDBACK_CATEGORIES = {"bug", "feature", "general"} +FEEDBACK_STATUSES = {"open", "planned", "in_progress", "completed", "declined"} +ROADMAP_STATUSES = ("planned", "in_progress", "completed") + + +class FeedbackIn(BaseModel): + category: str = Field(min_length=1, max_length=16) + message: str = Field(min_length=1, max_length=5000) + email: Optional[str] = Field(default=None, max_length=254) + screenshot: Optional[str] = None # data URL: data:image/png;base64,... + + @field_validator("category") + @classmethod + def _valid_category(cls, v: str) -> str: + v = v.strip().lower() + if v not in FEEDBACK_CATEGORIES: + raise ValueError("category must be one of: bug, feature, general") + return v + + @field_validator("message") + @classmethod + def _not_blank(cls, v: str) -> str: + if not v.strip(): + raise ValueError("message must not be blank") + return v.strip() + + @field_validator("screenshot") + @classmethod + def _valid_screenshot(cls, v: Optional[str]) -> Optional[str]: + if v is None: + return v + if not v.startswith("data:image/"): + raise ValueError("screenshot must be an image data URL") + return v + + +class FeedbackOut(BaseModel): + id: int + category: str + message: str + status: str + github_issue_url: Optional[str] = None + created_at: datetime + + class Config: + from_attributes = True + + +class ContactSubmitResponse(BaseModel): + message: str + ticket: SupportTicketOut + + +class FeedbackListResponse(BaseModel): + data: List[FeedbackOut] + page: int + page_size: int + total: int + + +class FeedbackStatusUpdate(BaseModel): + status: str + + @field_validator("status") + @classmethod + def _valid_status(cls, v: str) -> str: + v = v.strip().lower() + if v not in FEEDBACK_STATUSES: + raise ValueError("invalid status") + return v + + +class RoadmapItem(BaseModel): + id: int + category: str + message: str + status: str + + class Config: + from_attributes = True + + +class RoadmapResponse(BaseModel): + planned: List[RoadmapItem] + in_progress: List[RoadmapItem] + completed: List[RoadmapItem] + +# โ”€โ”€โ”€ LLM feedback (#402) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +class LLMFeedbackIn(BaseModel): + feature: str = Field(min_length=1, max_length=64) + prompt: str = Field(min_length=1, max_length=8000) + output: str = Field(min_length=1, max_length=8000) + rating: int = Field(ge=1, le=5) + comment: Optional[str] = Field(default=None, max_length=2000) + user_id: Optional[str] = Field(default=None, max_length=128) + is_expert: bool = False + expert_weight: float = Field(default=1.0, ge=1.0, le=5.0) + + @field_validator("feature", "prompt", "output") + @classmethod + def _strip_required(cls, v: str) -> str: + v = v.strip() + if not v: + raise ValueError("must not be blank") + return v + + +class LLMFeedbackOut(BaseModel): + id: int + feature: str + rating: int + comment: Optional[str] = None + is_expert: bool + expert_weight: float + created_at: datetime + + class Config: + from_attributes = True + + +class LLMFeedbackTrend(BaseModel): + feature: str + count: int + average_rating: float + weighted_average_rating: float + expert_count: int + + +class LLMFeedbackDashboard(BaseModel): + total: int + trends: List[LLMFeedbackTrend] + low_rating_examples: List[LLMFeedbackOut] + + +class LLMPromptImprovement(BaseModel): + feature: str + recommendation: str + evidence_count: int + + +# โ”€โ”€โ”€ Translation (Issue 1) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +class TranslationRequest(BaseModel): + text: str = Field(..., min_length=1, max_length=10000) + target_language: str = Field(..., min_length=2, max_length=10) + source_language: str = Field(default="auto", min_length=2, max_length=10) + context: Optional[str] = Field(default=None, max_length=500) + + +class TranslationResponse(BaseModel): + translation: str + source_language: str + target_language: str + cached: bool + + +class BatchTranslationRequest(BaseModel): + texts: List[str] = Field(..., min_length=1, max_length=100) + target_language: str = Field(..., min_length=2, max_length=10) + source_language: str = Field(default="auto", min_length=2, max_length=10) + context: Optional[str] = Field(default=None, max_length=500) + + +class BatchTranslationResponse(BaseModel): + translations: List[TranslationResponse] + + +class SupportedLanguagesResponse(BaseModel): + languages: Dict[str, Dict[str, str]] + + +class LocaleFormatRequest(BaseModel): + data: Dict[str, Any] + locale: str = Field(..., min_length=2, max_length=10) + currency: str = Field(default="USD", min_length=3, max_length=3) + + +class LocaleFormatResponse(BaseModel): + formatted: Dict[str, Any] + locale: str + + +class TranslationCacheStatsResponse(BaseModel): + hits: int + misses: int + sets: int + evictions: int + hit_rate: float + size: int + + +# โ”€โ”€โ”€ Predictive Alerts (Issue 2) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +class BehavioralBaseline(BaseModel): + account_id: str + metric_name: str + mean_value: float + std_dev: float + min_value: float + max_value: float + sample_size: int + last_updated: datetime + confidence_level: float = 0.95 + + +class BehavioralBaselineResponse(BaseModel): + baselines: List[BehavioralBaseline] + account_id: str + generated_at: datetime + + +class DeviationAlert(BaseModel): + alert_id: str + account_id: str + metric_name: str + current_value: float + expected_range: Tuple[float, float] + deviation_score: float + severity: Literal["low", "medium", "high", "critical"] + detected_at: datetime + confidence: float + + +class PredictiveAlertRequest(BaseModel): + account_id: str + lookback_days: int = Field(default=30, ge=1, le=365) + metrics: Optional[List[str]] = None + sensitivity: str = Field(default="medium", pattern="^(low|medium|high)$") + + +class PredictiveAlertResponse(BaseModel): + alerts: List[DeviationAlert] + baselines_used: List[BehavioralBaseline] + generated_at: datetime + total_analyzed: int + + +class AlertGenerationRequest(BaseModel): + deviations: List[DeviationAlert] + include_explanation: bool = True + + +class AlertGenerationResponse(BaseModel): + alerts: List[DeviationAlert] + explanations: List[str] + generated_at: datetime diff --git a/api/services/llm_explainer.py b/api/services/llm_explainer.py index e7f6e41..022ee09 100644 --- a/api/services/llm_explainer.py +++ b/api/services/llm_explainer.py @@ -1,5 +1,7 @@ import asyncio +from astroml.llm.metrics import LLM_COST_USD_TOTAL, LLM_REQUEST_LATENCY_SECONDS, LLM_REQUESTS_TOTAL + class TransactionExplainer: def __init__(self): self.prompt_template = ( @@ -13,14 +15,19 @@ async def explain(self, tx_details: str) -> str: Generate a plain language explanation for a transaction. Response time guaranteed < 2s for testing. """ - await asyncio.sleep(0.5) # Simulate API call latency, but keep under 2s - - # Mock LLM response for demonstration + start = asyncio.get_event_loop().time() + await asyncio.sleep(0.5) explanation = f"This transaction transferred funds between accounts. It appears to be a standard transfer related to: {tx_details[:20]}..." - - # Ensure it's under 100 words (Acceptance criteria) words = explanation.split() if len(words) >= 100: explanation = " ".join(words[:99]) - + + latency = (asyncio.get_event_loop().time() - start) * 1000.0 + try: + LLM_REQUESTS_TOTAL.labels(provider="transaction-explainer", status="success").inc() + LLM_REQUEST_LATENCY_SECONDS.labels(provider="transaction-explainer").observe(latency / 1000.0) + LLM_COST_USD_TOTAL.labels(provider="transaction-explainer").inc(0.0) + except Exception: + pass + return explanation diff --git a/api/services/predictive_alerts.py b/api/services/predictive_alerts.py new file mode 100644 index 0000000..0b8cd65 --- /dev/null +++ b/api/services/predictive_alerts.py @@ -0,0 +1,451 @@ +"""Predictive alerts service for account behavior changes (Issue 2).""" +from __future__ import annotations + +import logging +import statistics +from collections import defaultdict +from datetime import datetime, timedelta +from typing import Any, Dict, List, Optional, Tuple +from uuid import uuid4 + +import numpy as np +from scipy import stats + +from api.database import get_sync_db +from api.models.orm import ApiTransaction +from astroml.llm.provider import MockLLMProvider + +logger = logging.getLogger(__name__) + +# Initialize LLM provider for generating explanations +llm_provider = MockLLMProvider() + + +class BehavioralLearner: + """Learn behavioral baselines for account metrics.""" + + def __init__(self, min_samples: int = 5, confidence_level: float = 0.95): + self.min_samples = min_samples + self.confidence_level = confidence_level + self._baselines: Dict[str, Dict[str, Dict[str, Any]]] = {} # account_id -> metric -> stats + + def update_behavior(self, account_id: str, metrics: Dict[str, List[float]]) -> None: + """Update behavioral baselines for an account based on historical data.""" + if account_id not in self._baselines: + self._baselines[account_id] = {} + + for metric_name, values in metrics.items(): + if len(values) < self.min_samples: + continue + + try: + mean_val = statistics.mean(values) + stdev = statistics.stdev(values) if len(values) > 1 else 0.0 + min_val = min(values) + max_val = max(values) + + # Calculate confidence interval + if len(values) >= 2 and stdev > 0: + conf_interval = stats.t.interval( + self.confidence_level, + len(values) - 1, + loc=mean_val, + scale=stats.sem(values) + ) + else: + conf_interval = (mean_val, mean_val) + + self._baselines[account_id][metric_name] = { + "mean": mean_val, + "std_dev": stdev, + "min": min_val, + "max": max_val, + "sample_size": len(values), + "last_updated": datetime.utcnow(), + "confidence_interval": [float(ci) for ci in conf_interval], + "confidence_level": self.confidence_level + } + except Exception as e: + logger.warning(f"Failed to calculate baseline for {account_id}.{metric_name}: {e}") + + def get_baseline(self, account_id: str, metric_name: str) -> Optional[Dict[str, Any]]: + """Get baseline for a specific account and metric.""" + return self._baselines.get(account_id, {}).get(metric_name) + + def get_all_baselines(self, account_id: str) -> Dict[str, Dict[str, Any]]: + """Get all baselines for an account.""" + return self._baselines.get(account_id, {}) + + +class DeviationDetector: + """Detect significant deviations from behavioral baselines.""" + + def __init__(self, sensitivity: str = "medium"): + self.sensitivity = sensitivity + self.thresholds = { + "low": 2.5, # 2.5 sigma + "medium": 2.0, # 2.0 sigma + "high": 1.5, # 1.5 sigma + } + self.threshold = self.thresholds.get(sensitivity, 2.0) + + def detect_deviation( + self, + account_id: str, + metric_name: str, + current_value: float, + baseline: Dict[str, Any] + ) -> Optional[Dict[str, Any]]: + """Detect if current value deviates significantly from baseline.""" + if not baseline: + return None + + mean_val = baseline["mean"] + stdev = baseline["std_dev"] + + if stdev == 0: + # No variation in historical data + if current_value != mean_val: + deviation_score = float('inf') if current_value != mean_val else 0.0 + else: + return None + else: + # Calculate z-score (absolute deviation) + deviation_score = abs(current_value - mean_val) / stdev + + # Check if deviation exceeds threshold + if deviation_score > self.threshold: + # Get expected range (confidence interval) + ci_low, ci_high = baseline.get("confidence_interval", [mean_val, mean_val]) + + # Determine severity based on deviation magnitude + if deviation_score >= 3.5: + severity = "critical" + elif deviation_score >= 2.5: + severity = "high" + elif deviation_score >= 1.5: + severity = "medium" + else: + severity = "low" + + return { + "alert_id": str(uuid4()), + "account_id": account_id, + "metric_name": metric_name, + "current_value": current_value, + "expected_range": [float(ci_low), float(ci_high)], + "deviation_score": float(deviation_score), + "severity": severity, + "confidence": min(0.95, 0.5 + (deviation_score - self.threshold) * 0.1) + } + + return None + + def detect_multiple_deviations( + self, + account_id: str, + current_metrics: Dict[str, float], + baselines: Dict[str, Dict[str, Any]] + ) -> List[Dict[str, Any]]: + """Detect deviations across multiple metrics.""" + deviations = [] + + for metric_name, current_value in current_metrics.items(): + baseline = baselines.get(metric_name) + if baseline: + deviation = self.detect_deviation( + account_id, metric_name, current_value, baseline + ) + if deviation: + deviations.append(deviation) + + # Sort by deviation score (descending) + deviations.sort(key=lambda x: x["deviation_score"], reverse=True) + return deviations + + +class AlertGenerator: + """Generate alerts and explanations using LLM.""" + + def __init__(self, llm_provider=None): + self.llm = llm_provider or MockLLMProvider() + self._explanation_cache: Dict[str, str] = {} + + def generate_explanation( + self, + alert_id: str, + account_id: str, + deviation_data: Dict[str, Any] + ) -> Dict[str, Any]: + """Generate natural language explanation for a deviation.""" + # Check cache first + cache_key = f"{alert_id}:{hash(str(deviation_data))}" + if cache_key in self._explanation_cache: + return {"explanation": self._explanation_cache[cache_key]} + + try: + # Build prompt for LLM + prompt = f""" + Explain this financial anomaly in clear, concise language suitable for a fraud analyst: + + Account: {account_id} + Metric: {deviation_data.get('metric_name', 'unknown')} + Current Value: {deviation_data.get('current_value', 0):.2f} + Expected Range: [{deviation_data.get('expected_range', [0, 0])[0]:.2f}, {deviation_data.get('expected_range', [0, 0])[1]:.2f}] + Deviation Score: {deviation_data.get('deviation_score', 0):.2f} + Severity: {deviation_data.get('severity', 'unknown')} + + Provide a brief explanation of what this anomaly might indicate about account behavior. + Keep it under 2 sentences and focus on the business implications. + """ + + # Generate explanation using LLM + explanation_text = self.llm.generate(prompt) + + # Clean up the explanation + explanation_text = explanation_text.strip() + if not explanation_text.endswith('.'): + explanation_text += '.' + + # Cache the result + self._explanation_cache[cache_key] = explanation_text + + return {"explanation": explanation_text} + + except Exception as e: + logger.error(f"Failed to generate explanation for alert {alert_id}: {e}") + return { + "explanation": f"Anomaly detected in {deviation_data.get('metric_name', 'metric')} " + f"with deviation score of {deviation_data.get('deviation_score', 0):.2f}. " + f"This may indicate unusual account activity requiring investigation." + } + + def create_deviation_alerts( + self, + deviations: List[Dict[str, Any]] + ) -> List[Dict[str, Any]]: + """Convert raw detections to formatted alert objects with explanations.""" + alerts = [] + for deviation in deviations: + # Generate explanation + explanation_result = self.generate_explanation( + deviation["alert_id"], + deviation["account_id"], + deviation + ) + + # Create complete alert + alert = deviation.copy() + alert["explanation"] = explanation_result["explanation"] + alerts.append(alert) + + return alerts + + +class PredictiveAlertService: + """Main service for predictive alerts.""" + + def __init__(self): + self.behavioral_learner = BehavioralLearner() + self.deviation_detector = DeviationDetector() + self.alert_generator = AlertGenerator() + self._cache: Dict[str, Any] = {} + self._cache_ttl = 300 # 5 minutes + + async def learn_behavior_from_transactions( + self, + account_id: str, + days: int = 30 + ) -> Dict[str, Any]: + """Learn behavioral baselines from historical transaction data.""" + try: + db = next(get_sync_db()) + + # Calculate cutoff date + cutoff_date = datetime.utcnow() - timedelta(days=days) + + # Query transactions for the account + transactions = db.query(ApiTransaction).filter( + ApiTransaction.source_account == account_id, + ApiTransaction.created_at >= cutoff_date + ).all() + + if not transactions: + return {"message": "No transaction data found for learning period"} + + # Extract time-series metrics + daily_metrics = defaultdict(list) + + for tx in transactions: + date_key = tx.created_at.date() + daily_metrics[date_key].append({ + "amount": float(tx.amount or 0), + "hash": tx.hash + }) + + # Calculate daily aggregates + daily_aggregates = { + "daily_transaction_count": [], + "daily_total_amount": [], + "daily_avg_amount": [], + "unique_counterparties_per_day": [] + } + + for date, txs in daily_metrics.items(): + amounts = [tx["amount"] for tx in txs] + counterparts = list(set(tx.destination_account for tx in txs if tx.destination_account)) + + daily_aggregates["daily_transaction_count"].append(len(txs)) + daily_aggregates["daily_total_amount"].append(sum(amounts)) + daily_averages = statistics.mean(amounts) if amounts else 0 + daily_aggregates["daily_avg_amount"].append(daily_averages) + daily_aggregates["unique_counterparties_per_day"].append(len(counterparts)) + + # Update behavioral models + self.behavioral_learner.update_behavior(account_id, daily_aggregates) + + return { + "account_id": account_id, + "learning_period_days": days, + "transactions_analyzed": len(transactions), + "days_with_data": len(daily_metrics), + "metrics_learned": list(daily_aggregates.keys()), + "timestamp": datetime.utcnow().isoformat() + } + + except Exception as e: + logger.error(f"Error learning behavior for account {account_id}: {e}") + return {"error": str(e)} + finally: + db.close() + + async def generate_predictive_alerts( + self, + account_id: str, + lookback_days: int = 30, + metrics: Optional[List[str]] = None, + sensitivity: str = "medium" + ) -> Dict[str, Any]: + """Generate predictive alerts for an account.""" + try: + # Update detector sensitivity + self.deviation_detector.sensitivity = sensitivity + self.deviation_detector.threshold = self.deviation_detector.thresholds[sensitivity] + + # Learn/update behavioral baselines + learn_result = await self.learn_behavior_from_transactions( + account_id, lookback_days + ) + + if "error" in result: + return result + + # Get current metrics (last 24 hours) + current_metrics = await self._get_current_metrics(account_id) + + if not current_metrics: + return { + "message": "Insufficient recent data for analysis", + "account_id": account_id + } + + # Get learned baselines + baselines = self.behavioral_learner.get_all_baselines(account_id) + + if not baselines: + return { + "message": "Insufficient historical data to establish baselines", + "account_id": account_id + } + + # Filter metrics if specified + if metrics: + current_metrics = {k: v for k, v in current_metrics.items() if k in metrics} + baselines = {k: v for k, v in baselines.items() if k in metrics} + + # Detect deviations + deviations = self.deviation_detector.detect_multiple_deviations( + account_id, current_metrics, baselines + ) + + # Create formatted alerts with explanations + alerts = self.alert_generator.create_deviation_alerts(deviations) + + return { + "alerts": alerts, + "baselines_used": [ + { + "account_id": account_id, + "metric_name": name, + "mean_value": data["mean"], + "std_dev": data["std_dev"], + "min_value": data["min"], + "max_value": data["max"], + "sample_size": data["sample_size"], + "last_updated": data["last_updated"].isoformat() + } + for name, data in baselines.items() + ], + "generated_at": datetime.utcnow().isoformat(), + "total_analyzed": len(current_metrics), + "deviations_found": len(deviations), + "learning_info": learn_result + } + + except Exception as e: + logger.error(f"Error generating predictive alerts for {account_id}: {e}") + return {"error": str(e)} + + async def _get_current_metrics(self, account_id: str) -> Dict[str, float]: + """Get current metrics from recent transactions (last 24 hours).""" + try: + db = next(get_sync_db()) + + # Get transactions from last 24 hours + cutoff_date = datetime.utcnow() - timedelta(hours=24) + + transactions = db.query(ApiTransaction).filter( + ApiTransaction.source_account == account_id, + ApiTransaction.created_at >= cutoff_date + ).all() + + if not transactions: + return {} + + # Calculate current metrics + amounts = [float(tx.amount or 0) for tx in transactions] + counterparties = list(set(tx.destination_account for tx in transactions if tx.destination_account)) + + return { + "transaction_count": len(transactions), + "total_amount": sum(amounts), + "avg_amount": statistics.mean(amounts) if amounts else 0, + "max_amount": max(amounts) if amounts else 0, + "unique_counterparties": len(counterparties) + } + + except Exception as e: + logger.error(f"Error getting current metrics for {account_id}: {e}") + return {} + finally: + db.close() + + def get_service_status(self) -> Dict[str, Any]: + """Get status of the predictive alerts service.""" + total_accounts = len(self.behavioral_learner._baselines) + total_models = sum(len(metrics) for metrics in self.behavioral_learner._baselines.values()) + + return { + "service": "predictive_alerts", + "status": "active", + "models_learned": { + "accounts": total_accounts, + "total_baselines": total_models + }, + "cache_size": len(self._cache), + "timestamp": datetime.utcnow().isoformat() + } + + +# Global service instance +predictive_alert_service = PredictiveAlertService() \ No newline at end of file diff --git a/api/services/translation.py b/api/services/translation.py new file mode 100644 index 0000000..e61daaa --- /dev/null +++ b/api/services/translation.py @@ -0,0 +1,437 @@ +"""Translation service for multi-language LLM output support (Issue 1).""" +from __future__ import annotations + +import asyncio +import hashlib +import json +import os +import threading +import time +from dataclasses import dataclass, field +from datetime import datetime +from functools import lru_cache +from typing import Any, Dict, List, Optional, Union + +try: + import redis +except ImportError: + redis = None + +try: + import babel.dates + import babel.numbers + from babel.core import Locale +except ImportError: + babel = None + Locale = None + + +SUPPORTED_LANGUAGES: Dict[str, Dict[str, str]] = { + "en": {"name": "English", "native": "English", "locale": "en_US"}, + "es": {"name": "Spanish", "native": "Espaรฑol", "locale": "es_ES"}, + "fr": {"name": "French", "native": "Franรงais", "locale": "fr_FR"}, + "de": {"name": "German", "native": "Deutsch", "locale": "de_DE"}, + "zh": {"name": "Chinese (Simplified)", "native": "ไธญๆ–‡๏ผˆ็ฎ€ไฝ“๏ผ‰", "locale": "zh_CN"}, + "ja": {"name": "Japanese", "native": "ๆ—ฅๆœฌ่ชž", "locale": "ja_JP"}, + "ko": {"name": "Korean", "native": "ํ•œ๊ตญ์–ด", "locale": "ko_KR"}, + "pt": {"name": "Portuguese", "native": "Portuguรชs", "locale": "pt_BR"}, + "it": {"name": "Italian", "native": "Italiano", "locale": "it_IT"}, + "ru": {"name": "Russian", "native": "ะ ัƒััะบะธะน", "locale": "ru_RU"}, + "ar": {"name": "Arabic", "native": "ุงู„ุนุฑุจูŠุฉ", "locale": "ar_SA"}, + "hi": {"name": "Hindi", "native": "เคนเคฟเคจเฅเคฆเฅ€", "locale": "hi_IN"}, + "nl": {"name": "Dutch", "native": "Nederlands", "locale": "nl_NL"}, + "pl": {"name": "Polish", "native": "Polski", "locale": "pl_PL"}, + "tr": {"name": "Turkish", "native": "Tรผrkรงe", "locale": "tr_TR"}, +} + + +@dataclass +class TranslationCacheStats: + hits: int = 0 + misses: int = 0 + sets: int = 0 + evictions: int = 0 + + @property + def hit_rate(self) -> float: + total = self.hits + self.misses + return self.hits / total if total > 0 else 0.0 + + def to_dict(self) -> Dict[str, Any]: + return { + "hits": self.hits, + "misses": self.misses, + "sets": self.sets, + "evictions": self.evictions, + "hit_rate": round(self.hit_rate, 4), + } + + +class TranslationCache: + """Multi-layer translation cache with Redis backend and in-memory fallback.""" + + def __init__(self, ttl: int = 86400, max_memory_entries: int = 10000): + self.ttl = ttl + self.max_memory_entries = max_memory_entries + self._memory_cache: Dict[str, tuple[str, float]] = {} + self._lock = threading.RLock() + self._stats = TranslationCacheStats() + self._redis_client = None + + if redis is not None: + redis_url = os.getenv("REDIS_URL", "redis://localhost:6379/1") + try: + self._redis_client = redis.Redis.from_url(redis_url, decode_responses=True) + self._redis_client.ping() + except Exception: + self._redis_client = None + + def _make_key(self, source_text: str, target_lang: str, source_lang: str = "auto") -> str: + content = f"{source_lang}:{target_lang}:{source_text}" + return f"trans:{hashlib.sha256(content.encode()).hexdigest()[:32]}" + + def get(self, source_text: str, target_lang: str, source_lang: str = "auto") -> Optional[str]: + key = self._make_key(source_text, target_lang, source_lang) + + with self._lock: + if key in self._memory_cache: + value, expiry = self._memory_cache[key] + if time.time() < expiry: + self._stats.hits += 1 + return value + else: + del self._memory_cache[key] + + if self._redis_client: + try: + value = self._redis_client.get(key) + if value: + with self._lock: + self._memory_cache[key] = (value, time.time() + self.ttl) + self._enforce_memory_limit() + self._stats.hits += 1 + return value + except Exception: + pass + + self._stats.misses += 1 + return None + + def set(self, source_text: str, target_lang: str, translation: str, source_lang: str = "auto") -> None: + key = self._make_key(source_text, target_lang, source_lang) + expiry = time.time() + self.ttl + + with self._lock: + self._memory_cache[key] = (translation, expiry) + self._enforce_memory_limit() + self._stats.sets += 1 + + if self._redis_client: + try: + self._redis_client.setex(key, self.ttl, translation) + except Exception: + pass + + def _enforce_memory_limit(self) -> None: + if len(self._memory_cache) > self.max_memory_entries: + now = time.time() + expired = [k for k, (_, exp) in self._memory_cache.items() if exp < now] + for k in expired: + del self._memory_cache[k] + if len(self._memory_cache) > self.max_memory_entries: + oldest = min(self._memory_cache.items(), key=lambda x: x[1][1])[0] + del self._memory_cache[oldest] + self._stats.evictions += 1 + + def get_stats(self) -> Dict[str, Any]: + return self._stats.to_dict() + + def clear(self) -> int: + with self._lock: + count = len(self._memory_cache) + self._memory_cache.clear() + if self._redis_client: + try: + keys = self._redis_client.keys("trans:*") + if keys: + self._redis_client.delete(*keys) + except Exception: + pass + return count + + +class LocaleFormatter: + """Locale-aware formatting for dates, numbers, and currencies.""" + + def __init__(self, locale_code: str = "en_US"): + self.locale_code = locale_code + self._locale = None + if babel and Locale: + try: + self._locale = Locale.parse(locale_code) + except Exception: + self._locale = Locale.parse("en_US") + + @property + def locale(self): + if self._locale is None and babel and Locale: + self._locale = Locale.parse("en_US") + return self._locale + + def format_date(self, dt: datetime, format: str = "medium") -> str: + if not babel or not self.locale: + return dt.strftime("%Y-%m-%d") + try: + return babel.dates.format_date(dt, format=format, locale=self.locale) + except Exception: + return dt.strftime("%Y-%m-%d") + + def format_datetime(self, dt: datetime, format: str = "medium") -> str: + if not babel or not self.locale: + return dt.strftime("%Y-%m-%d %H:%M:%S") + try: + return babel.dates.format_datetime(dt, format=format, locale=self.locale) + except Exception: + return dt.strftime("%Y-%m-%d %H:%M:%S") + + def format_number(self, value: float, decimals: int = 2) -> str: + if not babel or not self.locale: + return f"{value:,.{decimals}f}" + try: + return babel.numbers.format_number(value, locale=self.locale) + except Exception: + return f"{value:,.{decimals}f}" + + def format_currency(self, amount: float, currency: str = "USD") -> str: + if not babel or not self.locale: + return f"{currency} {amount:,.2f}" + try: + return babel.numbers.format_currency(amount, currency, locale=self.locale) + except Exception: + return f"{currency} {amount:,.2f}" + + def format_percent(self, value: float, decimals: int = 1) -> str: + if not babel or not self.locale: + return f"{value * 100:.{decimals}f}%" + try: + return babel.numbers.format_percent(value, locale=self.locale) + except Exception: + return f"{value * 100:.{decimals}f}%" + + +class TranslationService: + """Main translation service with LLM-based translation and caching.""" + + def __init__(self, llm_provider=None): + self.llm_provider = llm_provider + self.cache = TranslationCache() + self._formatters: Dict[str, LocaleFormatter] = {} + self._lock = threading.Lock() + + def get_supported_languages(self) -> Dict[str, Dict[str, str]]: + return SUPPORTED_LANGUAGES.copy() + + def get_formatter(self, locale_code: str) -> LocaleFormatter: + with self._lock: + if locale_code not in self._formatters: + self._formatters[locale_code] = LocaleFormatter(locale_code) + return self._formatters[locale_code] + + def _build_translation_prompt( + self, + text: str, + target_lang: str, + source_lang: str = "auto", + context: Optional[str] = None, + ) -> str: + target_info = SUPPORTED_LANGUAGES.get(target_lang, {"name": target_lang, "native": target_lang}) + target_name = target_info["name"] + target_native = target_info["native"] + + context_part = f"\nContext: {context}" if context else "" + + return f"""Translate the following text to {target_name} ({target_native}). +Source language: {source_lang if source_lang != "auto" else "auto-detect"}{context_part} + +Text to translate: +{text} + +Return ONLY the translated text, no explanations or metadata.""" + + def translate( + self, + text: str, + target_lang: str, + source_lang: str = "auto", + context: Optional[str] = None, + use_cache: bool = True, + ) -> Dict[str, Any]: + if target_lang not in SUPPORTED_LANGUAGES: + raise ValueError(f"Unsupported language: {target_lang}. Supported: {list(SUPPORTED_LANGUAGES.keys())}") + + if use_cache: + cached = self.cache.get(text, target_lang, source_lang) + if cached is not None: + return { + "translated_text": cached, + "source_language": source_lang, + "target_language": target_lang, + "cached": True, + "latency_ms": 0.0, # Cached responses have zero latency + } + + start_time = time.time() + if self.llm_provider: + prompt = self._build_translation_prompt(text, target_lang, source_lang, context) + translation = self.llm_provider.generate(prompt) + else: + translation = f"[{target_lang}] {text}" + latency_ms = (time.time() - start_time) * 1000 + + if use_cache: + self.cache.set(text, target_lang, translation, source_lang) + + return { + "translated_text": translation.strip(), + "source_language": source_lang, + "target_language": target_lang, + "cached": False, + "latency_ms": round(latency_ms, 2), + } + + def translate_batch( + self, + texts: List[str], + target_lang: str, + source_lang: str = "auto", + context: Optional[str] = None, + use_cache: bool = True, + ) -> List[Dict[str, Any]]: + return [self.translate(t, target_lang, source_lang, context, use_cache) for t in texts] + + async def translate_async( + self, + text: str, + target_lang: str, + source_lang: str = "auto", + context: Optional[str] = None, + use_cache: bool = True, + ) -> Dict[str, Any]: + loop = asyncio.get_event_loop() + return await loop.run_in_executor( + None, + lambda: self.translate(text, target_lang, source_lang, context, use_cache) + ) + + async def translate_batch_async( + self, + texts: List[str], + target_lang: str, + source_lang: str = "auto", + context: Optional[str] = None, + use_cache: bool = True, + ) -> List[Dict[str, Any]]: + """Async batch translate multiple texts to target language.""" + loop = asyncio.get_event_loop() + return await loop.run_in_executor( + None, + lambda: [self.translate(t, target_lang, source_lang, context, use_cache) for t in texts] + ) + + def format_locale( + self, + value: Union[float, int, str], + locale: str, + format_type: str, + currency_code: Optional[str] = None, + ) -> str: + """Format a single value according to locale and format type.""" + formatter = self.get_formatter(locale) + + if format_type == "number": + if isinstance(value, str): + try: + value = float(value) + except ValueError: + return value + return formatter.format_number(float(value)) + elif format_type == "currency": + if isinstance(value, str): + try: + value = float(value) + except ValueError: + return value + currency = currency_code or "USD" + return formatter.format_currency(float(value), currency) + elif format_type == "percent": + if isinstance(value, str): + try: + value = float(value) + except ValueError: + return value + return formatter.format_percent(float(value)) + elif format_type == "date": + if isinstance(value, str): + try: + from datetime import datetime + value = datetime.fromisoformat(value.replace('Z', '+00:00')) + except ValueError: + return value + if isinstance(value, datetime): + return formatter.format_date(value) + return str(value) + elif format_type == "datetime": + if isinstance(value, str): + try: + from datetime import datetime + value = datetime.fromisoformat(value.replace('Z', '+00:00')) + except ValueError: + return value + if isinstance(value, datetime): + return formatter.format_datetime(value) + return str(value) + else: + return str(value) + + def get_cache_stats(self) -> Dict[str, Any]: + return self.cache.get_stats() + + def invalidate_cache(self, text: str) -> bool: + """Invalidate cache for given text across all target languages (using source language auto-detection).""" + # We'll use a special key pattern or just clear all for simplicity + # For a more sophisticated approach, we'd need to iterate through all language combinations + # For now, let's clear entries that start with the text hash + text_hash = hashlib.sha256(text.encode()).hexdigest()[:32] + + with self.cache._lock: + # Find and remove keys that match this text + keys_to_delete = [ + k for k in self.cache._memory_cache.keys() + if k.startswith(f"trans:{text_hash}") + ] + for k in keys_to_delete: + del self.cache._memory_cache[k] + + if self.cache._redis_client: + try: + # Use SCAN to find matching keys + cursor = 0 + deleted_count = 0 + while True: + cursor, keys = self.cache._redis_client.scan( + cursor, match=f"trans:*{text_hash}*", count=100 + ) + if keys: + deleted_count += self.cache._redis_client.delete(*keys) + if cursor == 0: + break + return deleted_count > 0 + except Exception: + return False + return len(keys_to_delete) > 0 if 'keys_to_delete' in locals() else False + + def invalidate_all_cache(self) -> int: + return self.cache.clear() + + +translation_service = TranslationService() \ No newline at end of file diff --git a/api/tests/test_llm_cost_aware.py b/api/tests/test_llm_cost_aware.py new file mode 100644 index 0000000..44f152b --- /dev/null +++ b/api/tests/test_llm_cost_aware.py @@ -0,0 +1,70 @@ +"""Cost-aware tests for LLM features.""" +from __future__ import annotations + +import time + +from fastapi.testclient import TestClient + +from astroml.llm.tracker import global_tracker +from astroml.llm.metrics import ( + LLM_COST_USD_TOTAL, + LLM_REQUEST_LATENCY_SECONDS, + LLM_TOKENS_TOTAL, +) + + +class TestLLMCostAware: + """Tests to ensure LLM usage stays within cost and latency budgets.""" + + def test_cost_threshold_not_exceeded(self, client: TestClient): + baseline_cost = global_tracker.total_cost + baseline_prom_cost = float( + LLM_COST_USD_TOTAL._metrics.get("_value", {}).get("value", 0.0) + ) + + response = client.post( + "/api/v1/llm/ask", + json={"question": "What is the cost of this request?"}, + ) + assert response.status_code == 200 + + new_cost = global_tracker.total_cost + new_prom_cost = float( + LLM_COST_USD_TOTAL._metrics.get("_value", {}).get("value", 0.0) + ) + delta = (new_cost - baseline_cost) + (new_prom_cost - baseline_prom_cost) + assert delta < 0.50, f"Single LLM request cost ${delta:.4f} exceeded $0.50 budget" + + def test_latency_budget(self, client: TestClient): + start = time.perf_counter() + response = client.post( + "/api/v1/llm/ask", + json={"question": "How fast is this response?"}, + ) + elapsed_ms = (time.perf_counter() - start) * 1000.0 + assert response.status_code == 200 + assert elapsed_ms < 2000.0, f"LLM request took {elapsed_ms:.1f}ms, exceeded 2000ms budget" + + def test_token_budget(self, client: TestClient): + baseline_tokens = global_tracker.total_prompt_tokens + global_tracker.total_completion_tokens + baseline_prom_tokens = sum( + v for v in LLM_TOKENS_TOTAL._metrics.get("_value", {}).values() + if isinstance(v, (int, float)) + ) + + response = client.post( + "/api/v1/llm/ask", + json={"question": "Count the tokens in this short question."}, + ) + assert response.status_code == 200 + + new_tokens = global_tracker.total_prompt_tokens + global_tracker.total_completion_tokens + delta = (new_tokens - baseline_tokens) + assert delta <= 2000, f"Request used {delta} tokens, exceeded budget of 2000" + + def test_health_check_latency(self, client: TestClient): + start = time.perf_counter() + response = client.get("/api/v1/llm/health") + elapsed_ms = (time.perf_counter() - start) * 1000.0 + assert response.status_code == 200 + assert elapsed_ms < 5000.0, f"Health check took {elapsed_ms:.1f}ms" diff --git a/api/tests/test_llm_health.py b/api/tests/test_llm_health.py new file mode 100644 index 0000000..3aa82b0 --- /dev/null +++ b/api/tests/test_llm_health.py @@ -0,0 +1,33 @@ +"""Integration tests for LLM health endpoints.""" +from __future__ import annotations + + +class TestLLMHealth: + def test_llm_health_returns_200(self, client): + resp = client.get("/api/v1/llm/health") + assert resp.status_code == 200 + + def test_llm_health_has_overall_status(self, client): + data = client.get("/api/v1/llm/health").json() + assert "overall_status" in data + assert "providers" in data + assert "checked_at" in data + + def test_llm_provider_health_endpoint(self, client): + resp = client.get("/api/v1/llm/health/openai") + assert resp.status_code == 200 + data = resp.json() + assert data["provider"] == "openai" + assert "status" in data + assert "latency_ms" in data + + def test_llm_health_providers_include_expected(self, client): + data = client.get("/api/v1/llm/health").json() + assert "openai" in data["providers"] + assert "anthropic" in data["providers"] + assert "huggingface" in data["providers"] + + def test_prometheus_metrics_endpoint(self, client): + resp = client.get("/metrics") + assert resp.status_code == 200 + assert "astroml_llm_provider_health" in resp.text diff --git a/astroml/llm/explainer.py b/astroml/llm/explainer.py index 5bc37a9..950a9f5 100644 --- a/astroml/llm/explainer.py +++ b/astroml/llm/explainer.py @@ -37,11 +37,12 @@ def generate_explanation(self, alert_id: int, account_id: str, pattern: str, sco latency_ms=latency_ms ) - # Cache the response self.cache.set(prompt, response) return response except Exception as e: + provider_name = self.provider.__class__.__name__.replace("Provider", "").lower() + global_tracker.record_error(provider_name) return f"Error generating explanation: {str(e)}" def _build_prompt(self, account_id: str, pattern: str, score: float, transactions: List[Dict[str, Any]]) -> str: diff --git a/astroml/llm/health.py b/astroml/llm/health.py new file mode 100644 index 0000000..e98c250 --- /dev/null +++ b/astroml/llm/health.py @@ -0,0 +1,113 @@ +"""LLM Provider health checks.""" +import asyncio +import os +import time +from typing import Any, Dict + +import aiohttp + +PROVIDER_ENDPOINTS = { + "openai": { + "url": "https://api.openai.com/v1/models", + "method": "GET", + "headers": lambda key: {"Authorization": f"Bearer {key}"}, + }, + "anthropic": { + "url": "https://api.anthropic.com/v1/messages", + "method": "HEAD", + "headers": lambda key: { + "x-api-key": key, + "anthropic-version": "2023-06-01", + }, + }, + "huggingface": { + "url": "https://api-inference.huggingface.co/status", + "method": "GET", + "headers": lambda key: {"Authorization": f"Bearer {key}"}, + }, +} + + +def _get_api_key(provider_name: str) -> str: + env_key = f"{provider_name.upper()}_API_KEY" + return os.getenv(env_key, "") + + +async def check_provider_health( + provider_name: str, timeout: float = 5.0 +) -> Dict[str, Any]: + start = time.perf_counter() + if provider_name not in PROVIDER_ENDPOINTS: + latency_ms = (time.perf_counter() - start) * 1000 + return { + "provider": provider_name, + "status": "unknown", + "latency_ms": round(latency_ms, 2), + "error": "Provider not supported for health checks", + } + + api_key = _get_api_key(provider_name) + if not api_key: + latency_ms = (time.perf_counter() - start) * 1000 + return { + "provider": provider_name, + "status": "unhealthy", + "latency_ms": round(latency_ms, 2), + "error": "API key not configured", + } + + config = PROVIDER_ENDPOINTS[provider_name] + + try: + async with aiohttp.ClientSession() as session: + async with session.request( + method=config["method"], + url=config["url"], + headers=config["headers"](api_key), + timeout=aiohttp.ClientTimeout(total=timeout), + ) as response: + latency_ms = (time.perf_counter() - start) * 1000 + healthy = 200 <= response.status < 300 + return { + "provider": provider_name, + "status": "healthy" if healthy else "unhealthy", + "latency_ms": round(latency_ms, 2), + "http_status": response.status, + } + except Exception as e: + latency_ms = (time.perf_counter() - start) * 1000 + return { + "provider": provider_name, + "status": "unhealthy", + "latency_ms": round(latency_ms, 2), + "error": str(e), + } + + +async def check_all_providers() -> Dict[str, Any]: + providers = list(PROVIDER_ENDPOINTS.keys()) + results = await asyncio.gather( + *(check_provider_health(p) for p in providers), + return_exceptions=True, + ) + + provider_statuses = {} + for result in results: + if isinstance(result, Exception): + provider_statuses["unknown"] = { + "provider": "unknown", + "status": "unhealthy", + "latency_ms": 0, + "error": str(result), + } + else: + provider_statuses[result["provider"]] = result + + all_healthy = all( + r.get("status") == "healthy" for r in provider_statuses.values() + ) + return { + "overall_status": "healthy" if all_healthy else "degraded", + "providers": provider_statuses, + "checked_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()), + } diff --git a/astroml/llm/metrics.py b/astroml/llm/metrics.py new file mode 100644 index 0000000..2413d24 --- /dev/null +++ b/astroml/llm/metrics.py @@ -0,0 +1,32 @@ +from prometheus_client import Counter, Gauge, Histogram + +LLM_REQUESTS_TOTAL = Counter( + "astroml_llm_requests_total", + "Total LLM API requests", + ["provider", "status"], +) + +LLM_REQUEST_LATENCY_SECONDS = Histogram( + "astroml_llm_request_latency_seconds", + "LLM API request latency in seconds", + ["provider"], + buckets=[0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30, 60], +) + +LLM_COST_USD_TOTAL = Counter( + "astroml_llm_cost_usd_total", + "Total LLM API cost in USD", + ["provider"], +) + +LLM_TOKENS_TOTAL = Counter( + "astroml_llm_tokens_total", + "Total LLM tokens processed", + ["provider", "token_type"], +) + +LLM_PROVIDER_HEALTH = Gauge( + "astroml_llm_provider_health", + "LLM provider health status (1=healthy, 0=unhealthy)", + ["provider"], +) diff --git a/astroml/llm/tracker.py b/astroml/llm/tracker.py index aba3ca8..85672bd 100644 --- a/astroml/llm/tracker.py +++ b/astroml/llm/tracker.py @@ -1,19 +1,24 @@ """LLM Token Usage and Cost Tracking.""" import logging -from typing import Dict, Optional +from typing import Dict + +from astroml.llm.metrics import ( + LLM_COST_USD_TOTAL, + LLM_REQUEST_LATENCY_SECONDS, + LLM_REQUESTS_TOTAL, + LLM_TOKENS_TOTAL, +) logger = logging.getLogger(__name__) -# Mock cost per 1k tokens for different providers COST_RATES = { "openai": {"prompt": 0.03, "completion": 0.06}, "anthropic": {"prompt": 0.015, "completion": 0.075}, "huggingface": {"prompt": 0.001, "completion": 0.001}, } -class LLMUsageTracker: - """Tracks LLM API usage, costs, and latency.""" +class LLMUsageTracker: def __init__(self): self.total_cost = 0.0 self.total_prompt_tokens = 0 @@ -23,42 +28,66 @@ def __init__(self): def record_usage( self, provider_name: str, usage: Dict[str, int], latency_ms: float ) -> float: - """ - Record usage for a request and calculate cost. - Logs an alert if total cost exceeds the threshold. - """ - rates = COST_RATES.get(provider_name.lower(), {"prompt": 0.0, "completion": 0.0}) - + rates = COST_RATES.get( + provider_name.lower(), {"prompt": 0.0, "completion": 0.0} + ) prompt_tokens = usage.get("prompt_tokens", 0) completion_tokens = usage.get("completion_tokens", 0) - - cost = (prompt_tokens / 1000.0) * rates["prompt"] + (completion_tokens / 1000.0) * rates["completion"] - + cost = (prompt_tokens / 1000.0) * rates["prompt"] + ( + completion_tokens / 1000.0 + ) * rates["completion"] + self.total_prompt_tokens += prompt_tokens self.total_completion_tokens += completion_tokens self.total_cost += cost - + + LLM_REQUESTS_TOTAL.labels( + provider=provider_name, status="success" + ).inc() + LLM_REQUEST_LATENCY_SECONDS.labels(provider=provider_name).observe( + latency_ms / 1000.0 + ) + LLM_COST_USD_TOTAL.labels(provider=provider_name).inc(cost) + LLM_TOKENS_TOTAL.labels( + provider=provider_name, token_type="prompt" + ).inc(prompt_tokens) + LLM_TOKENS_TOTAL.labels( + provider=provider_name, token_type="completion" + ).inc(completion_tokens) + logger.info( - "LLM Usage Recorded: Provider=%s, PromptTokens=%d, CompletionTokens=%d, Cost=$%.4f, Latency=%.2fms", - provider_name, prompt_tokens, completion_tokens, cost, latency_ms + "LLM Usage Recorded: Provider=%s, PromptTokens=%d, " + "CompletionTokens=%d, Cost=$%.4f, Latency=%.2fms", + provider_name, + prompt_tokens, + completion_tokens, + cost, + latency_ms, ) - + self.check_alerts() return cost + def record_error(self, provider_name: str) -> None: + LLM_REQUESTS_TOTAL.labels(provider=provider_name, status="error").inc() + def check_alerts(self): - """Check if cost alerts should be triggered.""" if self.total_cost > self.alert_threshold: - logger.warning("LLM Cost Alert! Total cost ($%.2f) has exceeded threshold ($%.2f)", self.total_cost, self.alert_threshold) + logger.warning( + "LLM Cost Alert! Total cost ($%.2f) has exceeded " + "threshold ($%.2f)", + self.total_cost, + self.alert_threshold, + ) def get_summary(self) -> Dict[str, float]: - """Get summary of tracking metrics.""" return { "total_cost": self.total_cost, "total_prompt_tokens": self.total_prompt_tokens, "total_completion_tokens": self.total_completion_tokens, - "total_tokens": self.total_prompt_tokens + self.total_completion_tokens + "total_tokens": self.total_prompt_tokens + + self.total_completion_tokens, } -# Global singleton tracker + global_tracker = LLMUsageTracker() diff --git a/docs/runbooks/llm_health.md b/docs/runbooks/llm_health.md new file mode 100644 index 0000000..68a4680 --- /dev/null +++ b/docs/runbooks/llm_health.md @@ -0,0 +1,88 @@ +# LLM Infrastructure Runbook + +## Overview + +This runbook covers health checks, monitoring, alerting, and incident response for LLM providers (OpenAI, Anthropic, HuggingFace). + +## Health Check Architecture + +- **Health endpoints**: `GET /api/v1/llm/health` and `GET /api/v1/llm/health/{provider}` +- **Polling interval**: 60 seconds via Prometheus or external monitor +- **Metrics endpoint**: `GET /metrics` (Prometheus text format) +- **Grafana dashboard**: `monitoring/grafana/llm_health_dashboard.json` + +## Key Metrics + +| Metric | Type | Description | +|--------|------|-------------| +| `astroml_llm_provider_health` | Gauge | 1 = healthy, 0 = unhealthy | +| `astroml_llm_request_latency_seconds` | Histogram | Per-provider latency | +| `astroml_llm_requests_total` | Counter | Request count by provider and status | +| `astroml_llm_cost_usd_total` | Counter | Accumulated cost USD | +| `astroml_llm_tokens_total` | Counter | Token count by provider and token_type | + +## Alerts + +| Alert | Condition | Severity | +|-------|-----------|----------| +| `LLMProviderDown` | Provider health == 0 for > 2m | Critical | +| `LLMHighErrorRate` | Error rate > 0.1 req/s for > 2m | Warning | +| `LLMCostThreshold` | Cost > $10 in 1h window | Warning | +| `LLMHighLatency` | P95 latency > 5s for > 3m | Warning | + +## Cost Tracking + +- **Threshold**: $100 (logged) +- **Granularity**: Per-request cost calculated using mock rates in `astroml/llm/tracker.py` +- **Alerting**: Prometheus `LLMCostThreshold` rule triggers on spikes (>$10/hour) +- **Dashboard**: Cost panel in Grafana shows 1-hour rolling sums + +## Incident Response + +### Provider Down +1. Check `LLMProviderDown` alert in Alertmanager +2. Verify API keys are configured (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `HUGGINGFACE_API_KEY`) +3. Check network connectivity from container to provider API +4. Review provider status pages: OpenAI, Anthropic, HuggingFace +5. Rotate API keys if suspected exposure +6. Failover: update `LLM_PROVIDER` env var to alternate provider + +### High Error Rate +1. Check `LLMHighErrorRate` alert +2. Correlate with latency spikes in Grafana dashboard +3. Review application logs for stack traces +4. Check for rate limits or quota exhaustion +5. Consider switching providers or reducing request rate + +### Cost Spike +1. Check `LLMCostThreshold` alert +2. Correlate with traffic volume in Grafana +3. Review recent deployments for prompt regression +4. If legitimate growth, update budget thresholds +5. If anomaly, audit prompt caching (`SemanticCache`) and consider tightening limits + +## Runbook Verification + +```bash +# Verify health endpoint +curl -s http://localhost:8000/api/v1/llm/health | jq + +# Verify metrics exposition +curl -s http://localhost:8000/metrics | grep astroml_llm_ + +# Run monitoring stack +docker compose --profile monitoring up -d + +# Check Prometheus targets +open http://localhost:9090/targets + +# Open Grafana +open http://localhost:3000 +Default login: admin/admin +``` + +## Maintenance + +- **Dashboard refresh**: Import `monitoring/grafana/llm_health_dashboard.json` into Grafana +- **Alert review**: Review rules in `monitoring/prometheus/alert_rules.yml` +- **Rate updates**: Update mock cost rates in `astroml/llm/tracker.py` and `COST_RATES` from provider pricing pages diff --git a/k8s/astroml-api-deployment.yaml b/k8s/astroml-api-deployment.yaml new file mode 100644 index 0000000..97874e5 --- /dev/null +++ b/k8s/astroml-api-deployment.yaml @@ -0,0 +1,61 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: astroml-api + namespace: astroml + labels: + app: astroml-api +spec: + replicas: 3 + selector: + matchLabels: + app: astroml-api + template: + metadata: + labels: + app: astroml-api + spec: + serviceAccountName: astroml + containers: + - name: api + image: ghcr.io/${{ github.repository }}:production-latest + imagePullPolicy: Always + ports: + - containerPort: 8000 + name: http + env: + - name: DATABASE_URL + valueFrom: + configMapKeyRef: + name: astroml-config + key: DATABASE_URL + - name: REDIS_URL + valueFrom: + configMapKeyRef: + name: astroml-config + key: REDIS_URL + - name: LLM_PROVIDER + value: "openai" + - name: ASTROML_ENV + value: "production" + resources: + requests: + memory: "256Mi" + cpu: "250m" + limits: + memory: "512Mi" + cpu: "500m" + livenessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 30 + periodSeconds: 10 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /api/v1/llm/health + port: 8000 + initialDelaySeconds: 15 + periodSeconds: 5 + failureThreshold: 3 diff --git a/k8s/kustomization.yaml b/k8s/kustomization.yaml index d0123bd..889e845 100644 --- a/k8s/kustomization.yaml +++ b/k8s/kustomization.yaml @@ -9,6 +9,8 @@ resources: - redis-deployment.yaml - feature-store-deployment.yaml - astroml-deployment.yaml + - astroml-api-deployment.yaml + - llm-canary-deployment.yaml - services.yaml - ingress.yaml - monitoring.yaml diff --git a/k8s/llm-canary-deployment.yaml b/k8s/llm-canary-deployment.yaml new file mode 100644 index 0000000..88420b9 --- /dev/null +++ b/k8s/llm-canary-deployment.yaml @@ -0,0 +1,82 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: astroml-api-canary + namespace: astroml + labels: + app: astroml-api + version: canary +spec: + replicas: 1 + selector: + matchLabels: + app: astroml-api + version: canary + template: + metadata: + labels: + app: astroml-api + version: canary + spec: + serviceAccountName: astroml + containers: + - name: api + image: ghcr.io/${{ github.repository }}:production-${{ github.sha }} + imagePullPolicy: Always + ports: + - containerPort: 8000 + name: http + env: + - name: DATABASE_URL + valueFrom: + configMapKeyRef: + name: astroml-config + key: DATABASE_URL + - name: REDIS_URL + valueFrom: + configMapKeyRef: + name: astroml-config + key: REDIS_URL + - name: LLM_PROVIDER + value: "openai" + - name: ASTROML_ENV + value: "production" + resources: + requests: + memory: "256Mi" + cpu: "250m" + limits: + memory: "512Mi" + cpu: "500m" + livenessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 15 + periodSeconds: 10 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /api/v1/llm/health + port: 8000 + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 3 +--- +apiVersion: v1 +kind: Service +metadata: + name: astroml-api-canary + namespace: astroml + labels: + app: astroml-api + version: canary +spec: + type: ClusterIP + ports: + - port: 8000 + targetPort: 8000 + name: http + selector: + app: astroml-api + version: canary diff --git a/monitoring/grafana/llm_health_dashboard.json b/monitoring/grafana/llm_health_dashboard.json new file mode 100644 index 0000000..124fa24 --- /dev/null +++ b/monitoring/grafana/llm_health_dashboard.json @@ -0,0 +1,114 @@ +{ + "dashboard": { + "title": "LLM Health Monitoring", + "uid": "llm-health", + "timezone": "browser", + "schemaVersion": 38, + "version": 0, + "refresh": "1m", + "time": {"from": "now-6h", "to": "now"}, + "panels": [ + { + "id": 1, + "title": "Provider Health Status", + "type": "stat", + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 0}, + "targets": [ + { + "expr": "astroml_llm_provider_health", + "legendFormat": "{{provider}}", + "refId": "A" + } + ], + "fieldConfig": { + "defaults": { + "mappings": [ + {"type": "value", "value": "0", "text": "Unhealthy"}, + {"type": "value", "value": "1", "text": "Healthy"} + ], + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "red", "value": 0}, + {"color": "green", "value": 1} + ] + } + } + } + }, + { + "id": 2, + "title": "P95 Request Latency (seconds)", + "type": "graph", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}, + "targets": [ + { + "expr": "histogram_quantile(0.95, rate(astroml_llm_request_latency_seconds_bucket[5m]))", + "legendFormat": "{{provider}}", + "refId": "A" + } + ], + "yaxes": [ + {"format": "s", "label": "Latency"}, + {"format": "short", "show": false} + ] + }, + { + "id": 3, + "title": "Request Error Rate (per sec)", + "type": "graph", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8}, + "targets": [ + { + "expr": "rate(astroml_llm_requests_total{status=\"error\"}[5m])", + "legendFormat": "{{provider}}", + "refId": "A" + } + ] + }, + { + "id": 4, + "title": "Total Cost (USD)", + "type": "graph", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16}, + "targets": [ + { + "expr": "sum by (provider) (increase(astroml_llm_cost_usd_total[1h]))", + "legendFormat": "{{provider}}", + "refId": "A" + } + ], + "yaxes": [ + {"format": "currencyUSD", "label": "Cost"}, + {"format": "short", "show": false} + ] + }, + { + "id": 5, + "title": "Total Tokens (last 1h)", + "type": "graph", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16}, + "targets": [ + { + "expr": "sum by (provider, token_type) (increase(astroml_llm_tokens_total[1h]))", + "legendFormat": "{{provider}} - {{token_type}}", + "refId": "A" + } + ] + }, + { + "id": 6, + "title": "Total Requests", + "type": "graph", + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 24}, + "targets": [ + { + "expr": "sum by (provider, status) (astroml_llm_requests_total)", + "legendFormat": "{{provider}} ({{status}})", + "refId": "A" + } + ] + } + ] + } +} diff --git a/monitoring/prometheus/alert_rules.yml b/monitoring/prometheus/alert_rules.yml index b61b75e..84264bc 100644 --- a/monitoring/prometheus/alert_rules.yml +++ b/monitoring/prometheus/alert_rules.yml @@ -28,11 +28,49 @@ groups: summary: "Ingestion stalled for {{ $labels.stream_type }} on {{ $labels.horizon_url }}" description: "No records have been processed for {{ $labels.stream_type }} in the last 15 minutes." - - alert: PersistentRateLimit - expr: astroml_ingestion_rate_limit_backoff_seconds > 60 - for: 10m + - alert: PersistentRateLimit + expr: astroml_ingestion_rate_limit_backoff_seconds > 60 + for: 10m + labels: + severity: warning + annotations: + summary: "Persistent rate limiting for {{ $labels.stream_type }}" + description: "The ingestion service is facing persistent rate limiting with backoff > 60s for over 10 minutes." + + - name: astroml_llm_alerts + rules: + - alert: LLMProviderDown + expr: astroml_llm_provider_health == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "LLM provider {{ $labels.provider }} is down" + description: "The {{ $labels.provider }} LLM provider has been unreachable for more than 2 minutes." + + - alert: LLMHighErrorRate + expr: rate(astroml_llm_requests_total{status="error"}[5m]) > 0.1 + for: 2m + labels: + severity: warning + annotations: + summary: "High LLM error rate for {{ $labels.provider }}" + description: "LLM error rate for {{ $labels.provider }} is currently {{ $value }} requests/sec." + + - alert: LLMCostThreshold + expr: increase(astroml_llm_cost_usd_total[1h]) > 10 + for: 5m + labels: + severity: warning + annotations: + summary: "LLM cost spike for {{ $labels.provider }}" + description: "LLM cost for {{ $labels.provider }} exceeded $10 in the last hour." + + - alert: LLMHighLatency + expr: histogram_quantile(0.95, rate(astroml_llm_request_latency_seconds_bucket[5m])) > 5 + for: 3m labels: severity: warning annotations: - summary: "Persistent rate limiting for {{ $labels.stream_type }}" - description: "The ingestion service is facing persistent rate limiting with backoff > 60s for over 10 minutes." + summary: "High LLM latency for {{ $labels.provider }}" + description: "P95 latency for {{ $labels.provider }} is {{ $value }}s (threshold 5s)." diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml index 6b73ec3..0807380 100644 --- a/monitoring/prometheus/prometheus.yml +++ b/monitoring/prometheus/prometheus.yml @@ -90,3 +90,13 @@ scrape_configs: - source_labels: [__address__] target_label: instance replacement: 'production' + + # FastAPI API service metrics + - job_name: 'astroml-api' + metrics_path: '/metrics' + static_configs: + - targets: ['api:8000'] + relabel_configs: + - source_labels: [__address__] + target_label: instance + replacement: 'api' diff --git a/scripts/auto-rollback.sh b/scripts/auto-rollback.sh new file mode 100644 index 0000000..c07fa55 --- /dev/null +++ b/scripts/auto-rollback.sh @@ -0,0 +1,24 @@ +#!/bin/bash +set -euo pipefail + +NAMESPACE="${NAMESPACE:-astroml}" +STABLE_DEPLOYMENT="${STABLE_DEPLOYMENT:-astroml-api}" +ROLLBACK_TIMEOUT=300 + +echo "Initiating rollback for ${STABLE_DEPLOYMENT} in namespace ${NAMESPACE}..." + +kubectl rollout undo deployment/${STABLE_DEPLOYMENT} -n "${NAMESPACE}" + +if kubectl rollout status deployment/${STABLE_DEPLOYMENT} -n "${NAMESPACE}" --timeout=${ROLLBACK_TIMEOUT}s; then + echo "Rollback completed successfully for ${STABLE_DEPLOYMENT}" +else + echo "Rollback verification failed for ${STABLE_DEPLOYMENT}" + kubectl get pods -n "${NAMESPACE}" + kubectl describe deployment/${STABLE_DEPLOYMENT} -n "${NAMESPACE}" + exit 1 +fi + +REVISION=$(kubectl rollout history deployment/${STABLE_DEPLOYMENT} -n "${NAMESPACE}" | tail -n 1 | awk '{print $1}') +echo "Rolled back to revision ${REVISION}" + +kubectl get events --field-selector involved-object.name=${STABLE_DEPLOYMENT} -n "${NAMESPACE}" --sort-by='.lastTimestamp' | tail -n 10 diff --git a/scripts/canary-deploy.sh b/scripts/canary-deploy.sh new file mode 100644 index 0000000..9c818e5 --- /dev/null +++ b/scripts/canary-deploy.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -euo pipefail + +REGISTRY="${REGISTRY:-ghcr.io}" +REPO="${REPO:-$GITHUB_REPOSITORY}" +IMAGE_TAG="${IMAGE_TAG:-latest}" +NAMESPACE="${NAMESPACE:-astroml}" +CANARY_DEPLOYMENT="astroml-api-canary" +STABLE_DEPLOYMENT="astroml-api" +CANARY_SERVICE="astroml-api-canary" +EXPECTED_REPLICAS=1 + +echo "Deploying canary: ${REGISTRY}/${REPO}:${IMAGE_TAG}" + +kubectl apply -f k8s/llm-canary-deployment.yaml -n "${NAMESPACE}" + +kubectl set image deployment/${CANARY_DEPLOYMENT} \ + api=${REGISTRY}/${REPO}:${IMAGE_TAG} \ + -n "${NAMESPACE}" --record + +kubectl rollout status deployment/${CANARY_DEPLOYMENT} \ + -n "${NAMESPACE}" --timeout=300s + +CANARY_REPLICAS=$(kubectl get deployment ${CANARY_DEPLOYMENT} -n "${NAMESPACE}" -o jsonpath='{.status.readyReplicas}' || echo 0) +if [ "${CANARY_REPLICAS}" -ne "${EXPECTED_REPLICAS}" ]; then + echo "Canary deployment failed: expected ${EXPECTED_REPLICAS} replicas, got ${CANARY_REPLICAS}" + exit 1 +fi + +echo "Canary deployed successfully with ${CANARY_REPLICAS} replica(s)" diff --git a/scripts/canary-promote.sh b/scripts/canary-promote.sh new file mode 100644 index 0000000..d98866b --- /dev/null +++ b/scripts/canary-promote.sh @@ -0,0 +1,23 @@ +#!/bin/bash +set -euo pipefail + +REGISTRY="${REGISTRY:-ghcr.io}" +REPO="${REPO:-$GITHUB_REPOSITORY}" +IMAGE_TAG="${IMAGE_TAG:-latest}" +NAMESPACE="${NAMESPACE:-astroml}" +CANARY_DEPLOYMENT="astroml-api-canary" +STABLE_DEPLOYMENT="astroml-production" + +echo "Promoting canary to stable..." + +kubectl set image deployment/${STABLE_DEPLOYMENT} \ + api=${REGISTRY}/${REPO}:${IMAGE_TAG} \ + -n "${NAMESPACE}" --record + +kubectl rollout status deployment/${STABLE_DEPLOYMENT} \ + -n "${NAMESPACE}" --timeout=300s + +echo "Scaling down canary..." +kubectl scale deployment/${CANARY_DEPLOYMENT} -n "${NAMESPACE}" --replicas=0 + +echo "Canary promoted successfully"