diff --git a/.github/workflows/ci-cd.yml b/.github/workflows/ci-cd.yml
new file mode 100644
index 0000000..de5ab0a
--- /dev/null
+++ b/.github/workflows/ci-cd.yml
@@ -0,0 +1,371 @@
+name: CI/CD Pipeline
+
+on:  # pushes to, and pull requests targeting, main and develop
+  push:
+    branches: [main, develop]
+  pull_request:
+    branches: [main, develop]
+
+env:  # workflow-wide values, read by later jobs through the `env` context
+  REGISTRY: ghcr.io
+  IMAGE_NAME: ${{ github.repository }}
+
+jobs:
+  # Linting and code quality: style, import order and typing, then security checks
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Set up Python
+      uses: actions/setup-python@v4
+      with:
+        cache: pip
+        python-version: '3.11'
+
+    - name: Install Python dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements.txt
+        pip install flake8 black isort mypy pytest pytest-cov
+
+    - name: Run linting  # black and isort run in check-only mode; nothing is rewritten
+      run: |
+        flake8 backend/app/ --max-line-length=88 --extend-ignore=E203,W503
+        black --check backend/app/
+        isort --check-only backend/app/
+        mypy backend/app/
+
+    - name: Run security checks  # the bandit report goes to bandit-report.json, not the log
+      run: |
+        pip install bandit safety
+        bandit -r backend/app/ -f json -o bandit-report.json
+        safety check
+
+  # Backend Testing: pytest with coverage against throwaway Postgres and Redis services
+  test-backend:
+    runs-on: ubuntu-latest
+    services:
+      postgres:
+        image: postgres:15
+        ports:
+          - '5432:5432'
+        env:
+          POSTGRES_DB: test_db
+          POSTGRES_PASSWORD: password
+        options: >-
+          --health-cmd pg_isready
+          --health-interval 10s
+          --health-timeout 5s
+          --health-retries 5
+
+      redis:
+        image: redis:7
+        ports:
+          - '6379:6379'
+        options: >-
+          --health-cmd "redis-cli ping"
+          --health-interval 10s
+          --health-timeout 5s
+          --health-retries 5
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Set up Python
+      uses: actions/setup-python@v4
+      with:
+        cache: pip
+        python-version: '3.11'
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements.txt
+
+    - name: Run database migrations  # brings the empty test_db up to the latest revision
+      env:
+        DATABASE_URL: postgresql://postgres:password@localhost:5432/test_db
+      run: |
+        cd backend
+        alembic upgrade head
+
+    - name: Run tests with coverage  # writes XML (uploaded below) and HTML reports
+      env:
+        DATABASE_URL: postgresql://postgres:password@localhost:5432/test_db
+        REDIS_URL: redis://localhost:6379/0
+        SECRET_KEY: test-secret-key
+        OPENAI_API_KEY: test-key
+      run: |
+        cd backend
+        pytest tests/ -v --cov=app --cov-report=xml --cov-report=html
+
+    - name: Upload coverage to Codecov
+      uses: codecov/codecov-action@v3
+      with:
+        name: backend-coverage
+        flags: backend
+        file: ./backend/coverage.xml
+
+  # Frontend Testing: lint, type-check and test the app that lives under frontend/.
+  # Each npm step sets frontend/ as its working directory instead of cd-ing in-script.
+  test-frontend:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Set up Node.js
+      uses: actions/setup-node@v4
+      with:
+        cache: npm
+        cache-dependency-path: frontend/package-lock.json
+        node-version: '18'
+
+    # Dependencies come from the same lockfile that keys the npm cache above
+    - name: Install dependencies
+      working-directory: frontend
+      run: npm ci
+
+    - name: Run linting
+      working-directory: frontend
+      run: npm run lint
+
+    - name: Run type checking
+      working-directory: frontend
+      run: npm run type-check
+
+    # The bare `--` hands the remaining flags to the package's own test script
+    - name: Run tests
+      working-directory: frontend
+      run: npm test -- --coverage --watchAll=false
+
+    # The report path is relative to the repository root, not to frontend/
+    - name: Upload coverage to Codecov
+      uses: codecov/codecov-action@v3
+      with:
+        name: frontend-coverage
+        flags: frontend
+        file: ./frontend/coverage/lcov.info
+
+  # Integration Testing: live server + Postgres/Redis; env is job-level so all steps share it
+  integration-test:
+    runs-on: ubuntu-latest
+    needs: [test-backend, test-frontend]
+    env:
+      DATABASE_URL: postgresql://postgres:password@localhost:5432/test_db
+      REDIS_URL: redis://localhost:6379/0
+      SECRET_KEY: test-secret-key
+      OPENAI_API_KEY: test-key
+    services:
+      postgres:
+        image: postgres:15
+        env:
+          POSTGRES_PASSWORD: password
+          POSTGRES_DB: test_db
+        options: >-
+          --health-cmd pg_isready
+          --health-interval 10s
+          --health-timeout 5s
+          --health-retries 5
+        ports:
+          - 5432:5432
+
+      redis:
+        image: redis:7
+        options: >-
+          --health-cmd "redis-cli ping"
+          --health-interval 10s
+          --health-timeout 5s
+          --health-retries 5
+        ports:
+          - 6379:6379
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Set up Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.11'
+        cache: 'pip'
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements.txt locust
+
+    - name: Start backend server  # migrate the fresh DB, launch in background, wait until it answers
+      run: |
+        cd backend
+        alembic upgrade head
+        python -m uvicorn app.main:app --host 0.0.0.0 --port 8000 &
+        curl -sS -o /dev/null --retry 15 --retry-delay 2 --retry-connrefused http://localhost:8000/
+
+    - name: Run integration tests
+      run: |
+        cd backend
+        pytest tests/integration/ -v
+
+    - name: Run load tests  # 10 users, spawned 2 per second, for 60 seconds
+      run: |
+        cd backend
+        locust -f tests/load/locustfile.py --headless --users 10 --spawn-rate 2 --run-time 60s
+
+  # Build Docker Images: backend and frontend images are pushed to the registry on push events
+  build:
+    runs-on: ubuntu-latest
+    needs: [lint, test-backend, test-frontend, integration-test]
+    if: github.event_name == 'push'
+    permissions: { contents: read, packages: write }  # lets GITHUB_TOKEN push to ghcr.io
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Set up Docker Buildx
+      uses: docker/setup-buildx-action@v3
+
+    - name: Log in to Container Registry
+      uses: docker/login-action@v3
+      with:
+        registry: ${{ env.REGISTRY }}
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+
+    - name: Extract metadata  # no PR tag rule: this job never runs for pull_request events
+      id: meta
+      uses: docker/metadata-action@v5
+      with:
+        images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+        tags: |
+          type=ref,event=branch
+          type=semver,pattern={{version}}
+          type=semver,pattern={{major}}.{{minor}}
+          type=sha
+
+    - name: Build and push backend image  # separate cache scopes keep the two builds from evicting each other
+      uses: docker/build-push-action@v5
+      with:
+        context: ./backend
+        push: true
+        tags: ${{ steps.meta.outputs.tags }}
+        labels: ${{ steps.meta.outputs.labels }}
+        cache-from: type=gha,scope=backend
+        cache-to: type=gha,mode=max,scope=backend
+
+    - name: Build and push frontend image  # suffix EVERY tag, otherwise backend tags get overwritten
+      uses: docker/build-push-action@v5
+      with:
+        context: ./frontend
+        push: true
+        tags: ${{ join(fromJSON(steps.meta.outputs.json).tags, '-frontend,') }}-frontend
+        labels: ${{ steps.meta.outputs.labels }}
+        cache-from: type=gha,scope=frontend
+        cache-to: type=gha,mode=max,scope=frontend
+
+  # Deploy to Staging: placeholder that only echoes; runs after `build` on develop
+  deploy-staging:
+    needs: build
+    if: github.ref == 'refs/heads/develop'
+    runs-on: ubuntu-latest
+    environment: staging
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Deploy to staging
+      run: |
+        echo "Deploying to staging environment"
+        # Add your staging deployment logic here
+        # Example: kubectl apply -f k8s/ -n staging
+
+  # Deploy to Production: placeholder that only echoes; runs after `build` on main
+  deploy-production:
+    needs: build
+    if: github.ref == 'refs/heads/main'
+    runs-on: ubuntu-latest
+    environment: production
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Deploy to production
+      run: |
+        echo "Deploying to production environment"
+        # Add your production deployment logic here
+        # Example: kubectl apply -f k8s/ -n production
+
+  # Security Scanning: scans the branch-tagged image that `build` pushed, so it runs after it
+  security-scan:
+    runs-on: ubuntu-latest
+    needs: [build]
+    permissions: { contents: read, security-events: write }  # SARIF upload needs the latter
+    steps:
+    - uses: actions/checkout@v4
+    - name: Run Trivy vulnerability scanner  # NOTE(review): a private package also needs registry auth
+      uses: aquasecurity/trivy-action@0.28.0
+      with:
+        image-ref: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${{ github.ref_name }}
+        format: 'sarif'
+        output: 'trivy-results.sarif'
+    - name: Upload Trivy scan results to GitHub Security tab
+      uses: github/codeql-action/upload-sarif@v3
+      if: always()
+      with:
+        sarif_file: 'trivy-results.sarif'
+
+  # Performance Testing: headless Locust run, only after `build` and only on main
+  performance-test:
+    needs: build
+    if: github.ref == 'refs/heads/main'
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Set up Python
+      uses: actions/setup-python@v4
+      with:
+        cache: pip
+        python-version: '3.11'
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install locust
+
+    # 100 users, spawned 10 per second, for 300 seconds
+    - name: Run performance tests
+      working-directory: backend
+      run: locust -f tests/performance/locustfile.py --headless --users 100 --spawn-rate 10 --run-time 300s
+
+  # Documentation Generation: export the OpenAPI spec, build the Sphinx HTML, publish it on main
+  docs:
+    runs-on: ubuntu-latest
+    permissions: { contents: write }  # the gh-pages deploy pushes with GITHUB_TOKEN
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Set up Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.11'
+        cache: 'pip'
+
+    - name: Install dependencies  # requirements.txt is needed to import and serve app.main below
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements.txt sphinx sphinx-rtd-theme
+
+    - name: Generate API documentation  # NOTE(review): no DATABASE_URL/SECRET_KEY set here - confirm the app boots
+      run: |
+        cd backend
+        python -m uvicorn app.main:app --host 0.0.0.0 --port 8000 &
+        curl -sS --fail --retry 15 --retry-delay 2 --retry-connrefused -o ../docs/api-spec.json http://localhost:8000/openapi.json
+
+    - name: Build documentation
+      run: |
+        cd docs
+        make html
+
+    - name: Deploy documentation
+      uses: peaceiris/actions-gh-pages@v3
+      if: github.ref == 'refs/heads/main'
+      with:
+        github_token: ${{ secrets.GITHUB_TOKEN }}
+        publish_dir: ./docs/_build/html
diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
new file mode 100644
index 0000000..f9f28c2
--- /dev/null
+++ b/ARCHITECTURE.md
@@ -0,0 +1,396 @@
+# 🏗️ AI Document Agent - System Architecture
+
+## Overview
+
+The AI Document Agent is built using a modern, scalable microservices architecture that leverages cutting-edge AI technologies and enterprise-grade infrastructure patterns. This document provides a comprehensive overview of the system's architecture, design decisions, and technical implementation.
+
+## 🎯 Architecture Principles
+
+### **Design Philosophy**
+- **Scalability First**: Horizontal scaling capabilities for all components
+- **Resilience**: Fault tolerance and graceful degradation
+- **Security by Design**: Multi-layered security approach
+- **Observability**: Comprehensive monitoring and tracing
+- **Performance**: Optimized for high-throughput document processing
+
+### **Technology Selection Criteria**
+- **Modern & Proven**: Industry-standard technologies with strong community support
+- **Performance**: High-performance frameworks and databases
+- **Scalability**: Technologies that support horizontal scaling
+- **Security**: Enterprise-grade security features
+- **Maintainability**: Clear separation of concerns and modular design
+
+## 🏛️ High-Level Architecture
+
+```
+┌─────────────────────────────────────────────────────────────────────────────┐
+│                              Client Layer                                   │
+├─────────────────────────────────────────────────────────────────────────────┤
+│  Web Browser (React SPA)  │  Mobile App  │  API Clients  │  Third-party   │
+└─────────────────────────────────────────────────────────────────────────────┘
+                                    │
+                                    ▼
+┌─────────────────────────────────────────────────────────────────────────────┐
+│                            Presentation Layer                              │
+├─────────────────────────────────────────────────────────────────────────────┤
+│  Nginx (Load Balancer)  │  SSL Termination  │  Rate Limiting  │  Caching   │
+└─────────────────────────────────────────────────────────────────────────────┘
+                                    │
+                                    ▼
+┌─────────────────────────────────────────────────────────────────────────────┐
+│                            Application Layer                               │
+├─────────────────────────────────────────────────────────────────────────────┤
+│  Frontend (React)  │  Backend API (FastAPI)  │  WebSocket Server  │  Admin  │
+└─────────────────────────────────────────────────────────────────────────────┘
+                                    │
+                                    ▼
+┌─────────────────────────────────────────────────────────────────────────────┐
+│                            Business Logic Layer                            │
+├─────────────────────────────────────────────────────────────────────────────┤
+│  Agent Orchestrator  │  Workflow Engine  │  Business Services  │  Rules    │
+└─────────────────────────────────────────────────────────────────────────────┘
+                                    │
+                                    ▼
+┌─────────────────────────────────────────────────────────────────────────────┐
+│                              AI Agent Layer                                │
+├─────────────────────────────────────────────────────────────────────────────┤
+│  Orchestrator  │  Ingestion  │  Classifier  │  Entity  │  Risk  │  QA      │
+│  Compare       │  Audit      │  Summarizer  │  Translator │  Sentiment   │
+└─────────────────────────────────────────────────────────────────────────────┘
+                                    │
+                                    ▼
+┌─────────────────────────────────────────────────────────────────────────────┐
+│                            Data Access Layer                               │
+├─────────────────────────────────────────────────────────────────────────────┤
+│  PostgreSQL  │  Redis  │  ChromaDB  │  Elasticsearch  │  File Storage   │
+└─────────────────────────────────────────────────────────────────────────────┘
+                                    │
+                                    ▼
+┌─────────────────────────────────────────────────────────────────────────────┐
+│                            Infrastructure Layer                            │
+├─────────────────────────────────────────────────────────────────────────────┤
+│  Docker  │  Kubernetes  │  Monitoring  │  Logging  │  Security  │  Backup  │
+└─────────────────────────────────────────────────────────────────────────────┘
+```
+
+## 🔧 Component Architecture
+
+### **Frontend Architecture (React + TypeScript)**
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│                    React Application                           │
+├─────────────────────────────────────────────────────────────────┤
+│  App Router  │  State Management  │  UI Components  │  Services │
+├─────────────────────────────────────────────────────────────────┤
+│  Pages       │  Context API       │  Material-UI    │  API      │
+│  Layouts     │  Custom Hooks      │  Custom Theme   │  WebSocket│
+│  Navigation  │  Local Storage     │  Animations     │  Utils    │
+└─────────────────────────────────────────────────────────────────┘
+```
+
+**Key Features:**
+- **Component-Based Architecture**: Reusable, composable components
+- **State Management**: React Context + Custom Hooks
+- **Type Safety**: Full TypeScript implementation
+- **Responsive Design**: Mobile-first approach
+- **Progressive Web App**: Offline capabilities
+
+### **Backend Architecture (FastAPI + Python)**
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│                    FastAPI Application                        │
+├─────────────────────────────────────────────────────────────────┤
+│  API Routes  │  Middleware  │  Dependencies  │  Background Tasks │
+├─────────────────────────────────────────────────────────────────┤
+│  Auth        │  CORS        │  Database      │  Celery Workers   │
+│  Documents   │  Logging     │  Cache         │  Agent Processing │
+│  Agents      │  Security    │  Validation    │  File Processing  │
+│  Analytics   │  Monitoring  │  Authentication│  Email/SMS       │
+└─────────────────────────────────────────────────────────────────┘
+```
+
+**Key Features:**
+- **Async/Await**: High-performance async operations
+- **Dependency Injection**: Clean, testable code
+- **OpenAPI**: Auto-generated API documentation
+- **Middleware Stack**: Security, logging, monitoring
+- **Background Processing**: Celery for heavy tasks
+
+### **AI Agent Architecture**
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│                    Agent Orchestrator                         │
+├─────────────────────────────────────────────────────────────────┤
+│  Workflow Engine  │  Agent Registry  │  Task Scheduler  │  Monitor │
+├─────────────────────────────────────────────────────────────────┤
+│  Pipeline Builder │  Agent Factory   │  Queue Manager   │  Metrics │
+│  State Manager    │  Config Manager  │  Retry Logic     │  Alerts  │
+└─────────────────────────────────────────────────────────────────┘
+                                    │
+                                    ▼
+┌─────────────────────────────────────────────────────────────────┐
+│                    Agent Execution Layer                      │
+├─────────────────────────────────────────────────────────────────┤
+│  Orchestrator  │  Ingestion  │  Classifier  │  Entity  │  Risk   │
+│  Compare       │  Audit      │  Summarizer  │  Translator│  QA   │
+└─────────────────────────────────────────────────────────────────┘
+```
+
+**Agent Capabilities:**
+- **Orchestrator**: Workflow coordination and decision-making
+- **Ingestion**: Document parsing and content extraction
+- **Classifier**: ML-powered document categorization
+- **Entity**: Named entity recognition and extraction
+- **Risk**: Compliance monitoring and risk assessment
+- **QA**: Interactive question-answering
+- **Compare**: Document comparison and diff analysis
+- **Audit**: Comprehensive audit logging
+- **Summarizer**: AI-powered document summarization
+- **Translator**: Multi-language translation
+- **Sentiment**: Sentiment analysis and tone detection
+
+## 🗄️ Data Architecture
+
+### **Database Design**
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│                    PostgreSQL (Primary DB)                    │
+├─────────────────────────────────────────────────────────────────┤
+│  Users & Auth  │  Documents  │  Processing  │  Analytics      │
+├─────────────────────────────────────────────────────────────────┤
+│  users         │  documents  │  workflows   │  metrics        │
+│  roles         │  tags       │  agents      │  events         │
+│  sessions      │  entities   │  executions  │  reports        │
+│  permissions   │  compliance │  history     │  dashboards     │
+└─────────────────────────────────────────────────────────────────┘
+```
+
+### **Caching Strategy**
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│                    Redis Cache Layers                         │
+├─────────────────────────────────────────────────────────────────┤
+│  Session Cache  │  Query Cache  │  Agent Cache  │  Rate Limiting │
+├─────────────────────────────────────────────────────────────────┤
+│  User sessions  │  API results  │  Agent state  │  Request limits│
+│  Auth tokens    │  DB queries   │  Model cache  │  IP blocking   │
+│  Permissions    │  Aggregations │  Config cache │  DDoS protection│
+└─────────────────────────────────────────────────────────────────┘
+```
+
+### **Vector Database (ChromaDB)**
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│                    ChromaDB (Vector Store)                   │
+├─────────────────────────────────────────────────────────────────┤
+│  Document Embeddings  │  Semantic Search  │  Similarity Index │
+├─────────────────────────────────────────────────────────────────┤
+│  Text embeddings      │  Vector queries   │  Cosine similarity│
+│  Metadata vectors     │  Hybrid search    │  Clustering       │
+│  Multi-modal vectors  │  Context retrieval│  Recommendations  │
+└─────────────────────────────────────────────────────────────────┘
+```
+
+## 🔒 Security Architecture
+
+### **Multi-Layer Security**
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│                    Security Layers                            │
+├─────────────────────────────────────────────────────────────────┤
+│  Network Security  │  Application Security  │  Data Security   │
+├─────────────────────────────────────────────────────────────────┤
+│  Firewalls        │  Authentication        │  Encryption       │
+│  DDoS Protection  │  Authorization         │  PII Redaction    │
+│  VPN Access       │  Input Validation      │  Data Masking     │
+│  Network Segments │  Rate Limiting         │  Audit Logging    │
+└─────────────────────────────────────────────────────────────────┘
+```
+
+### **Authentication & Authorization**
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│                    Auth Flow                                  │
+├─────────────────────────────────────────────────────────────────┤
+│  Client Request  │  JWT Validation  │  Role Check  │  Resource Access │
+├─────────────────────────────────────────────────────────────────┤
+│  Credentials     │  Token Verify    │  Permissions │  ACL Check       │
+│  MFA (Optional)  │  Expiry Check    │  Scope Check │  Audit Log       │
+│  Rate Limiting   │  Blacklist Check │  Context     │  Response        │
+└─────────────────────────────────────────────────────────────────┘
+```
+
+## 📊 Monitoring & Observability
+
+### **Observability Stack**
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│                    Monitoring Architecture                    │
+├─────────────────────────────────────────────────────────────────┤
+│  Metrics Collection  │  Log Aggregation  │  Distributed Tracing │
+├─────────────────────────────────────────────────────────────────┤
+│  Prometheus         │  Elasticsearch     │  Jaeger             │
+│  Custom Metrics     │  Filebeat          │  OpenTelemetry      │
+│  Health Checks      │  Structured Logs   │  Correlation IDs    │
+│  Alert Manager      │  Log Analysis      │  Performance Profiling│
+└─────────────────────────────────────────────────────────────────┘
+```
+
+### **Metrics & KPIs**
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│                    Metrics & KPIs                             │
+├─────────────────────────────────────────────────────────────────┤
+│  System Metrics  │  Application Metrics  │  Business Metrics   │
+├─────────────────────────────────────────────────────────────────┤
+│  CPU/Memory      │  Request Rate        │  Documents Processed │
+│  Disk I/O        │  Response Time       │  Processing Success  │
+│  Network         │  Error Rate          │  User Engagement     │
+│  Container       │  Throughput          │  Compliance Score    │
+└─────────────────────────────────────────────────────────────────┘
+```
+
+## 🚀 Deployment Architecture
+
+### **Container Orchestration**
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│                    Kubernetes Cluster                         │
+├─────────────────────────────────────────────────────────────────┤
+│  Ingress Controller  │  Service Mesh  │  Pod Management       │
+├─────────────────────────────────────────────────────────────────┤
+│  Nginx Ingress      │  Istio         │  Horizontal Pod Autoscaler│
+│  SSL Termination    │  Traffic Split │  Rolling Updates       │
+│  Load Balancing     │  Circuit Breaker│  Health Checks        │
+│  Rate Limiting      │  Retry Logic   │  Resource Limits       │
+└─────────────────────────────────────────────────────────────────┘
+```
+
+### **Environment Strategy**
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│                    Environment Pipeline                       │
+├─────────────────────────────────────────────────────────────────┤
+│  Development  │  Staging  │  Production  │  Disaster Recovery  │
+├─────────────────────────────────────────────────────────────────┤
+│  Local Docker │  K8s Dev  │  K8s Prod    │  Backup Cluster    │
+│  Hot Reload   │  Testing  │  Monitoring  │  Data Replication   │
+│  Debug Tools  │  QA       │  Scaling     │  Failover           │
+│  Mock Data    │  UAT      │  Security    │  Recovery Testing   │
+└─────────────────────────────────────────────────────────────────┘
+```
+
+## 🔄 Data Flow Architecture
+
+### **Document Processing Pipeline**
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│                    Document Processing Flow                   │
+├─────────────────────────────────────────────────────────────────┤
+│  1. Upload    │  2. Validate  │  3. Ingest   │  4. Classify    │
+│  File Upload  │  File Type    │  Extract     │  ML Model       │
+│  Virus Scan   │  Size Check   │  Content     │  Categorize     │
+│  Metadata     │  Format       │  Structure   │  Domain         │
+├─────────────────────────────────────────────────────────────────┤
+│  5. Extract   │  6. Analyze   │  7. Store    │  8. Index       │
+│  Entities     │  Risk         │  Database    │  Vector DB      │
+│  Keywords     │  Compliance   │  File System │  Search Index   │
+│  Relations    │  Sentiment    │  Backup      │  Analytics      │
+└─────────────────────────────────────────────────────────────────┘
+```
+
+### **Real-time Processing**
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│                    Real-time Architecture                     │
+├─────────────────────────────────────────────────────────────────┤
+│  WebSocket    │  Event Stream  │  Message Queue  │  Background  │
+├─────────────────────────────────────────────────────────────────┤
+│  Connection   │  Processing    │  Celery        │  Workers      │
+│  Heartbeat    │  Updates       │  Redis         │  Agent Tasks  │
+│  Reconnection │  Notifications │  Priority      │  File Tasks   │
+│  Broadcasting │  Status        │  Dead Letter   │  Email Tasks  │
+└─────────────────────────────────────────────────────────────────┘
+```
+
+## 🎯 Performance Architecture
+
+### **Scaling Strategies**
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│                    Scaling Architecture                       │
+├─────────────────────────────────────────────────────────────────┤
+│  Horizontal Scaling  │  Vertical Scaling  │  Auto Scaling     │
+├─────────────────────────────────────────────────────────────────┤
+│  Load Balancer      │  Resource Limits   │  HPA (K8s)        │
+│  Multiple Instances │  Memory/CPU        │  VPA (K8s)        │
+│  Database Sharding  │  Connection Pools  │  Custom Metrics   │
+│  Cache Clustering   │  Query Optimization│  Predictive Scaling│
+└─────────────────────────────────────────────────────────────────┘
+```
+
+### **Performance Optimization**
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│                    Performance Layers                         │
+├─────────────────────────────────────────────────────────────────┤
+│  CDN Layer     │  Cache Layer  │  Application Layer  │  DB Layer │
+├─────────────────────────────────────────────────────────────────┤
+│  Static Assets │  Redis Cache  │  Async Processing   │  Indexing │
+│  Global Edge   │  Query Cache  │  Connection Pooling │  Sharding │
+│  Compression   │  Session Cache│  Background Tasks   │  Replication│
+│  Caching       │  Agent Cache  │  Load Balancing     │  Partitioning│
+└─────────────────────────────────────────────────────────────────┘
+```
+
+## 🔧 Configuration Management
+
+### **Configuration Strategy**
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│                    Configuration Architecture                 │
+├─────────────────────────────────────────────────────────────────┤
+│  Environment Config  │  Feature Flags  │  Agent Config  │  Security │
+├─────────────────────────────────────────────────────────────────┤
+│  .env Files         │  Feature Toggles │  Model Params  │  Keys     │
+│  K8s ConfigMaps     │  A/B Testing     │  Timeouts      │  Certs    │
+│  Secrets Management │  Gradual Rollout │  Retry Logic   │  Policies │
+│  Dynamic Config     │  Canary Deploy   │  Thresholds    │  Rules    │
+└─────────────────────────────────────────────────────────────────┘
+```
+
+## 🎯 Future Architecture Considerations
+
+### **Planned Enhancements**
+- **GraphQL API**: For more flexible data querying
+- **Event Sourcing**: For complete audit trail
+- **CQRS Pattern**: For read/write optimization
+- **Service Mesh**: For advanced traffic management
+- **Multi-Region**: For global deployment
+- **Edge Computing**: For low-latency processing
+
+### **Technology Evolution**
+- **AI Model Updates**: Integration with latest LLMs
+- **Database Evolution**: Migration to distributed databases
+- **Cloud Native**: Full cloud-native architecture
+- **Serverless**: Event-driven serverless functions
+- **Blockchain**: For immutable audit trails
+
+---
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000..ff05ae3
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,191 @@
+# 🛠️ Development Standards & Best Practices
+
+This document outlines the development standards, coding practices, and quality assurance processes for the AI Document Agent platform.
+
+## 📋 Code Standards
+
+### **Python (Backend)**
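+- **PEP 8** compliance with an 88 character line limit (Black's default, matching the flake8 `--max-line-length=88` setting used in CI and the Makefile)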
+- **Type hints** required for all function parameters and return values
+- **Docstrings** using Google style for all public functions and classes
+- **Black** for code formatting
+- **isort** for import organization
+- **flake8** for linting
+
+### **TypeScript/JavaScript (Frontend)**
+- **ESLint** with strict TypeScript rules
+- **Prettier** for code formatting
+- **TypeScript strict mode** enabled
+- **React hooks** for state management
+- **Functional components** with proper prop typing
+
+### **Database**
+- **SQLAlchemy** ORM with async support
+- **Alembic** for database migrations
+- **Proper indexing** for performance
+- **Foreign key constraints** for data integrity
+
+## 🧪 Testing Requirements
+
+### **Test Coverage**
+- **Minimum 90%** code coverage for backend
+- **Minimum 80%** code coverage for frontend
+- **Integration tests** for all API endpoints
+- **Unit tests** for all business logic
+- **E2E tests** for critical user flows
+
+### **Testing Tools**
+```bash
+# Backend Testing
+pytest --cov=app --cov-report=html
+pytest tests/integration/ -v
+pytest tests/unit/ -v
+
+# Frontend Testing
+npm test -- --coverage
+npm run test:e2e
+
+# Load Testing
+locust -f tests/load/locustfile.py
+```
+
+## 🔍 Code Quality Gates
+
+### **Pre-commit Checks**
+```bash
+# Python quality checks
+black app/
+isort app/
+flake8 app/
+mypy app/
+pytest --cov=app --cov-fail-under=90
+
+# Frontend quality checks
+npm run lint
+npm run type-check
+npm test -- --coverage --watchAll=false
+npm run build
+```
+
+### **Performance Benchmarks**
+- **API Response Time**: <200ms average
+- **Frontend Bundle Size**: <2MB gzipped
+- **Database Query Time**: <100ms for complex queries
+- **Memory Usage**: <512MB per service
+
+## 📚 Documentation Standards
+
+### **Code Documentation**
+- **Comprehensive docstrings** for all public APIs
+- **Type hints** for all function signatures
+- **Inline comments** for complex business logic
+- **README updates** for new features
+
+### **API Documentation**
+- **OpenAPI/Swagger** specifications
+- **Example requests/responses** for all endpoints
+- **Error code documentation**
+- **Authentication examples**
+
+## 🚀 Deployment Standards
+
+### **Environment Management**
+- **Environment-specific** configuration files
+- **Secrets management** with proper encryption
+- **Health checks** for all services
+- **Graceful shutdown** handling
+
+### **Monitoring & Observability**
+- **Custom metrics** for business KPIs
+- **Structured logging** with correlation IDs
+- **Performance monitoring** with alerting
+- **Error tracking** and reporting
+
+## 🔒 Security Standards
+
+### **Code Security**
+- **Input validation** for all user inputs
+- **SQL injection** prevention with parameterized queries
+- **XSS protection** with proper escaping
+- **CSRF protection** for state-changing operations
+
+### **Infrastructure Security**
+- **Secrets rotation** policies
+- **Network segmentation** and firewalls
+- **Regular security audits** and penetration testing
+- **Vulnerability scanning** in CI/CD pipeline
+
+## 📊 Performance Standards
+
+### **Backend Performance**
+- **Async/await** for I/O operations
+- **Connection pooling** for database connections
+- **Caching strategies** for frequently accessed data
+- **Background task processing** for heavy operations
+
+### **Frontend Performance**
+- **Code splitting** and lazy loading
+- **Image optimization** and compression
+- **Bundle analysis** and optimization
+- **Progressive Web App** features
+
+## 🏗️ Architecture Standards
+
+### **Design Patterns**
+- **Repository pattern** for data access
+- **Service layer** for business logic
+- **Factory pattern** for object creation
+- **Observer pattern** for event handling
+
+### **Microservices Principles**
+- **Single responsibility** for each service
+- **Loose coupling** between services
+- **API versioning** strategy
+- **Circuit breaker** pattern for resilience
+
+## 📈 Quality Metrics
+
+### **Code Quality Metrics**
+- **Cyclomatic complexity** < 10 per function
+- **Maintainability index** > 65
+- **Technical debt** ratio < 5%
+- **Code duplication** < 3%
+
+### **Performance Metrics**
+- **Response time** percentiles (P50, P95, P99)
+- **Throughput** measurements
+- **Error rates** and availability
+- **Resource utilization** monitoring
+
+## 🔄 Development Workflow
+
+### **Feature Development**
+1. **Requirements analysis** and documentation
+2. **Technical design** and architecture review
+3. **Implementation** with TDD approach
+4. **Code review** and quality checks
+5. **Testing** and validation
+6. **Documentation** updates
+7. **Deployment** and monitoring
+
+### **Bug Fixes**
+1. **Issue reproduction** and root cause analysis
+2. **Fix implementation** with regression tests
+3. **Code review** and testing
+4. **Deployment** with rollback plan
+5. **Monitoring** and verification
+
+## 🎯 Success Criteria
+
+### **Code Quality**
+- **Zero critical** security vulnerabilities
+- **90%+ test coverage** maintained
+- **All linting rules** passing
+- **Performance benchmarks** met
+
+### **Business Value**
+- **Feature delivery** on schedule
+- **User satisfaction** metrics
+- **System reliability** and uptime
+- **Scalability** requirements met
+
diff --git a/DEPLOYMENT.md b/DEPLOYMENT.md
new file mode 100644
index 0000000..d31716a
--- /dev/null
+++ b/DEPLOYMENT.md
@@ -0,0 +1,501 @@
+# 🚀 AI Document Agent - Deployment Guide
+
+## Overview
+
+This document provides comprehensive deployment instructions for the AI Document Agent platform across different environments, from development to production. The deployment strategy follows enterprise-grade practices with security, scalability, and reliability as core principles.
+
+## 🎯 Deployment Strategy
+
+### **Environment Tiers**
+- **Development**: Local development with hot-reload
+- **Staging**: Production-like environment for testing
+- **Production**: High-availability, scalable deployment
+- **Disaster Recovery**: Backup and failover systems
+
+### **Deployment Models**
+- **Docker Compose**: For development and small-scale deployments
+- **Kubernetes**: For production and enterprise deployments
+- **Cloud Native**: AWS, Azure, GCP deployment options
+- **Hybrid**: On-premise with cloud integration
+
+## 🐳 Docker Compose Deployment
+
+### **Development Environment**
+
+```bash
+# Clone repository
+git clone https://github.com/your-org/ai-document-agent.git
+cd ai-document-agent
+
+# Copy environment configuration
+cp .env.example .env
+
+# Configure environment variables
+nano .env
+
+# Start development services
+docker-compose -f docker-compose.dev.yml up -d
+
+# Initialize database
+./scripts/init-db.sh
+
+# Access services
+# Frontend: http://localhost:3000
+# Backend: http://localhost:8000
+# API Docs: http://localhost:8000/docs
+```
+
+### **Production Environment**
+
+```bash
+# Production deployment
+docker-compose -f docker-compose.prod.yml up -d
+
+# With monitoring stack
+docker-compose -f docker-compose.prod.yml -f docker-compose.monitoring.yml up -d
+
+# Health check
+docker-compose ps
+curl http://localhost:8000/health
+```
+
+## ☸️ Kubernetes Deployment
+
+### **Prerequisites**
+```bash
+# Kubernetes cluster (minikube, kind, or cloud provider)
+kubectl version --client
+helm version
+
+# Install required tools
+kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.12.0/cert-manager.yaml
+kubectl apply -f https://raw.githubusercontent.com/kubernetes/ingress-nginx/controller-v1.7.1/deploy/static/provider/cloud/deploy.yaml
+```
+
+### **Namespace Setup**
+```bash
+# Create namespace
+kubectl create namespace ai-document-agent
+
+# Set context
+kubectl config set-context --current --namespace=ai-document-agent
+```
+
+### **Secrets Management**
+```bash
+# Create secrets
+kubectl create secret generic ai-document-agent-secrets \
+  --from-literal=postgres-password=your-secure-password \
+  --from-literal=redis-password=your-redis-password \
+  --from-literal=jwt-secret=your-jwt-secret \
+  --from-literal=openai-api-key=your-openai-key
+
+# Create config maps
+kubectl create configmap ai-document-agent-config \
+  --from-file=config/app.yaml \
+  --from-file=config/agents.yaml
+```
+
+### **Database Deployment**
+```bash
+# Deploy PostgreSQL
+helm repo add bitnami https://charts.bitnami.com/bitnami
+helm install postgres bitnami/postgresql \
+  --set auth.postgresPassword=your-secure-password \
+  --set primary.persistence.size=10Gi \
+  --set architecture=standalone
+
+# Deploy Redis
+helm install redis bitnami/redis \
+  --set auth.password=your-redis-password \
+  --set architecture=standalone \
+  --set master.persistence.size=5Gi
+```
+
+### **Application Deployment**
+```bash
+# Deploy core application
+kubectl apply -f k8s/namespace.yaml
+kubectl apply -f k8s/secrets.yaml
+kubectl apply -f k8s/configmaps.yaml
+kubectl apply -f k8s/services.yaml
+kubectl apply -f k8s/deployments.yaml
+kubectl apply -f k8s/ingress.yaml
+
+# Deploy monitoring stack
+kubectl apply -f k8s/monitoring/
+```
+
+### **Verification**
+```bash
+# Check deployment status
+kubectl get pods
+kubectl get services
+kubectl get ingress
+
+# Check logs
+kubectl logs -f deployment/backend
+kubectl logs -f deployment/frontend
+
+# Port forward for local access
+kubectl port-forward service/backend 8000:8000
+kubectl port-forward service/frontend 3000:3000
+```
+
+## ☁️ Cloud Deployment
+
+### **AWS Deployment**
+
+#### **ECS Fargate**
+```bash
+# Deploy with AWS CLI
+aws ecs create-cluster --cluster-name ai-document-agent
+
+# Create task definitions
+aws ecs register-task-definition --cli-input-json file://task-definition.json
+
+# Create service
+aws ecs create-service \
+  --cluster ai-document-agent \
+  --service-name backend \
+  --task-definition backend:1 \
+  --desired-count 2 \
+  --launch-type FARGATE
+```
+
+#### **EKS (Elastic Kubernetes Service)**
+```bash
+# Create EKS cluster
+eksctl create cluster \
+  --name ai-document-agent \
+  --region us-west-2 \
+  --nodegroup-name workers \
+  --node-type t3.medium \
+  --nodes 3 \
+  --nodes-min 1 \
+  --nodes-max 5
+
+# Deploy application
+kubectl apply -f k8s/
+```
+
+### **Azure Deployment**
+
+#### **AKS (Azure Kubernetes Service)**
+```bash
+# Create AKS cluster
+az aks create \
+  --resource-group ai-document-agent-rg \
+  --name ai-document-agent-cluster \
+  --node-count 3 \
+  --enable-addons monitoring \
+  --generate-ssh-keys
+
+# Get credentials
+az aks get-credentials --resource-group ai-document-agent-rg --name ai-document-agent-cluster
+
+# Deploy application
+kubectl apply -f k8s/
+```
+
+### **Google Cloud Deployment**
+
+#### **GKE (Google Kubernetes Engine)**
+```bash
+# Create GKE cluster
+gcloud container clusters create ai-document-agent \
+  --zone us-central1-a \
+  --num-nodes 3 \
+  --machine-type n1-standard-2 \
+  --enable-autoscaling \
+  --min-nodes 1 \
+  --max-nodes 5
+
+# Get credentials
+gcloud container clusters get-credentials ai-document-agent --zone us-central1-a
+
+# Deploy application
+kubectl apply -f k8s/
+```
+
+## 🔒 Security Configuration
+
+### **SSL/TLS Setup**
+```bash
+# Generate SSL certificates
+openssl req -x509 -nodes -days 365 -newkey rsa:2048 \
+  -keyout nginx/ssl/private.key \
+  -out nginx/ssl/certificate.crt
+
+# Configure Nginx with SSL
+kubectl apply -f k8s/ssl/
+```
+
+### **Network Security**
+```bash
+# Configure network policies
+kubectl apply -f k8s/network-policies/
+
+# Set up firewall rules: only HTTP/HTTPS are reachable from the public internet
+gcloud compute firewall-rules create ai-document-agent \
+  --allow tcp:80,tcp:443 \
+  --source-ranges 0.0.0.0/0 \
+  --target-tags ai-document-agent
+
+# SSH is limited to a trusted admin network; never expose port 22 to 0.0.0.0/0
+gcloud compute firewall-rules create ai-document-agent-ssh \
+  --allow tcp:22 \
+  --source-ranges <admin-cidr> \
+  --target-tags ai-document-agent
+```
+
+### **Secrets Management**
+```bash
+# Use external secrets manager
+helm install external-secrets external-secrets/external-secrets \
+  --set installCRDs=true
+
+# Configure secrets
+kubectl apply -f k8s/external-secrets/
+```
+
+## 📊 Monitoring & Observability
+
+### **Prometheus & Grafana**
+```bash
+# Deploy monitoring stack
+helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
+helm install prometheus prometheus-community/kube-prometheus-stack \
+  --set grafana.enabled=true \
+  --set prometheus.enabled=true
+
+# Access Grafana
+kubectl port-forward service/prometheus-grafana 3001:80
+# Username: admin, Password: prom-operator
+```
+
+### **ELK Stack**
+```bash
+# Deploy Elasticsearch
+helm repo add elastic https://helm.elastic.co
+helm install elasticsearch elastic/elasticsearch \
+  --set replicas=3
+
+# Deploy Kibana
+helm install kibana elastic/kibana \
+  --set service.type=LoadBalancer
+
+# Deploy Filebeat
+helm install filebeat elastic/filebeat
+```
+
+### **Jaeger Tracing**
+```bash
+# Deploy Jaeger
+helm repo add jaegertracing https://jaegertracing.github.io/helm-charts
+helm install jaeger jaegertracing/jaeger \
+  --set storage.type=elasticsearch \
+  --set storage.options.es.server-urls=http://elasticsearch-master:9200
+```
+
+## 🔄 CI/CD Pipeline
+
+### **GitHub Actions**
+```yaml
+# .github/workflows/deploy.yml
+name: Deploy to Production
+
+on:
+  push:
+    branches: [main]
+
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+    env:
+      # Tag images with the commit SHA: re-applying an unchanged ":latest" tag with
+      # `kubectl set image` does not alter the pod template, so no rollout would start.
+      IMAGE_TAG: ${{ github.sha }}
+    steps:
+      - uses: actions/checkout@v4
+
+      # NOTE: assumes registry login and cluster credentials are configured beforehand.
+      - name: Build and push Docker images
+        run: |
+          docker build -t "ai-document-agent/backend:${IMAGE_TAG}" ./backend
+          docker build -t "ai-document-agent/frontend:${IMAGE_TAG}" ./frontend
+          docker push "ai-document-agent/backend:${IMAGE_TAG}"
+          docker push "ai-document-agent/frontend:${IMAGE_TAG}"
+
+      - name: Deploy to Kubernetes
+        run: |
+          kubectl set image deployment/backend backend="ai-document-agent/backend:${IMAGE_TAG}"
+          kubectl set image deployment/frontend frontend="ai-document-agent/frontend:${IMAGE_TAG}"
+          kubectl rollout status deployment/backend
+          kubectl rollout status deployment/frontend
+```
+
+### **ArgoCD**
+```bash
+# Install ArgoCD
+kubectl create namespace argocd
+kubectl apply -n argocd -f https://raw.githubusercontent.com/argoproj/argo-cd/stable/manifests/install.yaml
+
+# Create application
+kubectl apply -f argocd/application.yaml
+```
+
+## 📈 Scaling & Performance
+
+### **Horizontal Pod Autoscaler**
+```yaml
+# k8s/hpa.yaml
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: backend-hpa
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: backend
+  minReplicas: 2
+  maxReplicas: 10
+  metrics:
+  - type: Resource
+    resource:
+      name: cpu
+      target:
+        type: Utilization
+        averageUtilization: 70
+  - type: Resource
+    resource:
+      name: memory
+      target:
+        type: Utilization
+        averageUtilization: 80
+```
+
+### **Database Scaling**
+```bash
+# Scale PostgreSQL
+helm upgrade postgres bitnami/postgresql \
+  --set readReplicas.persistence.size=10Gi \
+  --set readReplicas.replicaCount=2
+
+# Scale Redis
+helm upgrade redis bitnami/redis \
+  --set architecture=replication \
+  --set replica.replicaCount=3
+```
+
+## 🔄 Backup & Recovery
+
+### **Database Backup**
+```bash
+# Automated backup script
+#!/bin/bash
+BACKUP_DIR="/backups"
+DATE=$(date +%Y%m%d_%H%M%S)
+
+# PostgreSQL backup
+pg_dump -h $DB_HOST -U $DB_USER -d $DB_NAME > $BACKUP_DIR/postgres_$DATE.sql
+
+# Redis backup
+redis-cli -h $REDIS_HOST BGSAVE
+
+# File storage backup
+tar -czf $BACKUP_DIR/files_$DATE.tar.gz /data/uploads/
+
+# Upload to cloud storage
+aws s3 cp $BACKUP_DIR/postgres_$DATE.sql s3://ai-document-agent-backups/
+aws s3 cp $BACKUP_DIR/files_$DATE.tar.gz s3://ai-document-agent-backups/
+```
+
+### **Disaster Recovery**
+```bash
+# Restore from backup. The backup is a plain-SQL pg_dump file, which must be replayed
+# with psql; pg_restore only reads custom, directory or tar format archives.
+psql -h "$DB_HOST" -U "$DB_USER" -d "$DB_NAME" -f "$BACKUP_DIR/postgres_$DATE.sql"
+
+# Failover procedure
+kubectl apply -f k8s/disaster-recovery/failover.yaml
+```
+
+## 🧪 Testing Deployment
+
+### **Load Testing**
+```bash
+# Deploy load testing
+kubectl apply -f k8s/load-testing/
+
+# Run load test
+kubectl exec -it load-test-pod -- locust -f /app/locustfile.py \
+  --host=http://backend-service:8000
+```
+
+### **Health Checks**
+```bash
+# Automated health checks
+curl -f http://localhost:8000/health || exit 1
+curl -f http://localhost:3000/ || exit 1
+
+# Database connectivity
+pg_isready -h $DB_HOST -p $DB_PORT || exit 1
+
+# Redis connectivity
+redis-cli -h $REDIS_HOST ping || exit 1
+```
+
+## 📋 Deployment Checklist
+
+### **Pre-Deployment**
+- [ ] Environment variables configured
+- [ ] Secrets and certificates prepared
+- [ ] Database migrations ready
+- [ ] Load balancer configured
+- [ ] Monitoring stack deployed
+- [ ] Backup strategy implemented
+
+### **Deployment**
+- [ ] Database deployed and initialized
+- [ ] Application services deployed
+- [ ] Ingress and SSL configured
+- [ ] Health checks passing
+- [ ] Monitoring dashboards accessible
+- [ ] Logs being collected
+
+### **Post-Deployment**
+- [ ] Performance testing completed
+- [ ] Security scan passed
+- [ ] Documentation updated
+- [ ] Team notified
+- [ ] Rollback plan tested
+- [ ] Monitoring alerts configured
+
+## 🚨 Troubleshooting
+
+### **Common Issues**
+```bash
+# Pod not starting
+kubectl describe pod <pod-name>
+kubectl logs <pod-name>
+
+# Service not accessible
+kubectl get endpoints
+kubectl describe service <service-name>
+
+# Database connection issues
+kubectl exec -it <pod-name> -- nc -zv <db-host> <db-port>
+
+# Memory issues
+kubectl top pods
+kubectl describe node <node-name>
+```
+
+### **Debug Commands**
+```bash
+# Get cluster info
+kubectl cluster-info
+kubectl get nodes
+
+# Check resource usage
+kubectl top nodes
+kubectl top pods
+
+# View events
+kubectl get events --sort-by=.metadata.creationTimestamp
+
+# Port forwarding for debugging
+kubectl port-forward service/backend 8000:8000
+kubectl port-forward service/grafana 3001:80
+```
+
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..dbaa589
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,270 @@
+# AI Document Agent - Development Makefile
+# Provides convenient commands for development, testing, and deployment
+
+# Every target in this Makefile is a command, not a file. Declaring them all phony
+# keeps a same-named file or directory (for example docs/ or build/) from making a
+# target look up to date and silently skipping its recipe.
+.PHONY: help \
+        install install-backend install-frontend install-dev install-prod \
+        dev dev-backend dev-frontend \
+        test test-backend test-frontend test-unit test-integration test-coverage test-load \
+        lint lint-backend lint-frontend format format-backend format-frontend \
+        db-init db-migrate db-reset db-backup \
+        docker-build docker-run docker-stop docker-clean \
+        build build-backend build-frontend deploy-staging deploy-prod \
+        monitoring-start monitoring-stop \
+        clean logs health security-scan performance-test docs quickstart
+
+# Default target
+# Prints an overview of the available targets, grouped by purpose.
+help:
+	@echo "AI Document Agent - Development Commands"
+	@echo "========================================"
+	@echo ""
+	@echo "Installation:"
+	@echo "  install          Install all dependencies"
+	@echo "  install-dev      Install development dependencies"
+	@echo "  install-prod     Install production dependencies"
+	@echo ""
+	@echo "Development:"
+	@echo "  dev              Start development servers"
+	@echo "  dev-backend      Start backend development server"
+	@echo "  dev-frontend     Start frontend development server"
+	@echo ""
+	@echo "Testing:"
+	@echo "  test             Run all tests"
+	@echo "  test-unit        Run unit tests only"
+	@echo "  test-integration Run integration tests only"
+	@echo "  test-coverage    Run tests with coverage report"
+	@echo "  test-load        Run load tests"
+	@echo ""
+	@echo "Code Quality:"
+	@echo "  lint             Run all linting checks"
+	@echo "  lint-backend     Lint backend code"
+	@echo "  lint-frontend    Lint frontend code"
+	@echo "  format           Format all code"
+	@echo "  format-backend   Format backend code"
+	@echo "  format-frontend  Format frontend code"
+	@echo ""
+	@echo "Database:"
+	@echo "  db-init          Initialize database"
+	@echo "  db-migrate       Run database migrations"
+	@echo "  db-reset         Reset database"
+	@echo "  db-backup        Create database backup"
+	@echo ""
+	@echo "Docker:"
+	@echo "  docker-build     Build Docker images"
+	@echo "  docker-run       Run with Docker Compose"
+	@echo "  docker-stop      Stop Docker containers"
+	@echo "  docker-clean     Clean Docker resources"
+	@echo ""
+	@echo "Deployment:"
+	@echo "  build            Build for production"
+	@echo "  deploy-staging   Deploy to staging"
+	@echo "  deploy-prod      Deploy to production"
+	@echo ""
+	@echo "Monitoring:"
+	@echo "  monitoring-start Start monitoring stack"
+	@echo "  monitoring-stop  Stop monitoring stack"
+	@echo ""
+	@echo "Utilities:"
+	@echo "  clean            Clean build artifacts"
+	@echo "  logs             Show application logs"
+	@echo "  health           Check system health"
+	@echo ""
+	@echo "Security & Performance:"
+	@echo "  security-scan    Run security scans"
+	@echo "  performance-test Run performance tests"
+	@echo ""
+	@echo "Documentation:"
+	@echo "  docs             Generate API specification"
+	@echo ""
+	@echo "Getting Started:"
+	@echo "  quickstart       Install dev dependencies and initialize database"
+
+# Installation
+# `install` sets up a complete working tree: backend requirements (runtime and
+# development) plus the frontend packages.
+install: install-backend install-frontend
+	@echo "✅ All dependencies installed"
+
+# Backend runtime and development requirements.
+install-backend:
+	@echo "📦 Installing backend dependencies..."
+	pip install -r requirements.txt
+	pip install -r requirements-dev.txt
+
+install-frontend:
+	@echo "📦 Installing frontend dependencies..."
+	cd frontend && npm install
+
+install-dev: install-backend install-frontend
+	@echo "📦 Installing development dependencies..."
+	pip install -r requirements-dev.txt
+	cd frontend && npm install --include=dev
+
+# Production install is deliberately self-contained: depending on install-backend
+# would pull in requirements-dev.txt, and install-frontend would run a full
+# `npm install` first. `--omit=dev` replaces the deprecated `--only=production`.
+install-prod:
+	@echo "📦 Installing production dependencies..."
+	pip install -r requirements.txt
+	cd frontend && npm ci --omit=dev
+
+# Development
+# Both servers run in the foreground and never exit on their own, so listing them as
+# ordinary prerequisites would start the backend and never reach the frontend.
+# A parallel sub-make runs them side by side; Ctrl+C stops both.
+dev:
+	@echo "🚀 Starting development servers..."
+	@$(MAKE) -j2 dev-backend dev-frontend
+
+dev-backend:
+	@echo "🚀 Starting backend development server..."
+	cd backend && python -m uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
+
+dev-frontend:
+	@echo "🚀 Starting frontend development server..."
+	cd frontend && npm start
+
+# Testing
+# `test` runs the backend suite and then the frontend suite via its prerequisites.
+test: test-backend test-frontend
+	@echo "✅ All tests completed"
+
+# Runs everything under backend/tests with verbose output.
+test-backend:
+	@echo "🧪 Running backend tests..."
+	cd backend && pytest tests/ -v
+
+# --watchAll=false makes the frontend test runner exit after one pass instead of watching.
+test-frontend:
+	@echo "🧪 Running frontend tests..."
+	cd frontend && npm test -- --watchAll=false
+
+# Selects only tests matching the pytest marker expression "unit".
+test-unit:
+	@echo "🧪 Running unit tests..."
+	cd backend && pytest tests/ -m "unit" -v
+
+# Selects only tests matching the pytest marker expression "integration".
+test-integration:
+	@echo "🧪 Running integration tests..."
+	cd backend && pytest tests/ -m "integration" -v
+
+# Backend coverage is reported as HTML and as a terminal summary with missing lines;
+# the frontend run adds --coverage to the single-pass test run.
+test-coverage:
+	@echo "🧪 Running tests with coverage..."
+	cd backend && pytest tests/ --cov=app --cov-report=html --cov-report=term-missing
+	cd frontend && npm test -- --coverage --watchAll=false
+
+# Headless Locust run: 10 users, spawned 2 per second, for 60 seconds.
+# NOTE(review): no --host is passed, so the locustfile presumably sets it - confirm.
+test-load:
+	@echo "🧪 Running load tests..."
+	cd backend && locust -f tests/load/locustfile.py --headless --users 10 --spawn-rate 2 --run-time 60s
+
+# Code Quality
+# `lint` only checks; `format` rewrites files in place.
+lint: lint-backend lint-frontend
+	@echo "✅ All linting checks completed"
+
+# flake8, then black and isort in check-only mode (no files are modified), then mypy.
+# make stops at the first failing command, so later checks do not run after a failure.
+lint-backend:
+	@echo "🔍 Linting backend code..."
+	cd backend && flake8 app/ --max-line-length=88 --extend-ignore=E203,W503
+	cd backend && black --check app/
+	cd backend && isort --check-only app/
+	cd backend && mypy app/
+
+# Delegates to the `lint` and `type-check` npm scripts of the frontend package.
+lint-frontend:
+	@echo "🔍 Linting frontend code..."
+	cd frontend && npm run lint
+	cd frontend && npm run type-check
+
+format: format-backend format-frontend
+	@echo "✨ All code formatted"
+
+# Rewrites backend sources with black and isort.
+format-backend:
+	@echo "✨ Formatting backend code..."
+	cd backend && black app/
+	cd backend && isort app/
+
+# Delegates to the `format` npm script of the frontend package.
+format-frontend:
+	@echo "✨ Formatting frontend code..."
+	cd frontend && npm run format
+
+# Database
+# Runs the async init_database() coroutine from app.database.connection via asyncio.run.
+db-init:
+	@echo "🗄️ Initializing database..."
+	cd backend && python -c "from app.database.connection import init_database; import asyncio; asyncio.run(init_database())"
+
+# Applies all pending Alembic migrations up to the latest revision.
+db-migrate:
+	@echo "🗄️ Running database migrations..."
+	cd backend && alembic upgrade head
+
+# Reverts every migration (downgrade base), re-applies them all, then runs
+# initialize_default_data().
+# NOTE(review): the downgrade presumably drops all migrated tables and their data -
+# do not run this against a database whose contents you need.
+db-reset:
+	@echo "🗄️ Resetting database..."
+	cd backend && alembic downgrade base
+	cd backend && alembic upgrade head
+	cd backend && python -c "from app.database.connection import initialize_default_data; import asyncio; asyncio.run(initialize_default_data())"
+
+# Delegates to scripts/backup.sh.
+db-backup:
+	@echo "🗄️ Creating database backup..."
+	./scripts/backup.sh
+
+# Docker
+# All targets use the default compose file found in the current directory.
+docker-build:
+	@echo "🐳 Building Docker images..."
+	docker-compose build
+
+# Starts the stack detached (-d), so the command returns immediately.
+docker-run:
+	@echo "🐳 Starting Docker containers..."
+	docker-compose up -d
+
+docker-stop:
+	@echo "🐳 Stopping Docker containers..."
+	docker-compose down
+
+# Destructive: `down -v` also removes the stack's volumes, and `docker system prune -f`
+# prunes unused Docker resources for the whole host, not just this project.
+docker-clean:
+	@echo "🐳 Cleaning Docker resources..."
+	docker-compose down -v --remove-orphans
+	docker system prune -f
+
+# Deployment
+# `build` produces the backend and frontend production artifacts via its prerequisites.
+build: build-backend build-frontend
+	@echo "🏗️ Production build completed"
+
+# Runs `python -m build` inside backend/.
+build-backend:
+	@echo "🏗️ Building backend..."
+	cd backend && python -m build
+
+# Runs the `build` npm script inside frontend/.
+build-frontend:
+	@echo "🏗️ Building frontend..."
+	cd frontend && npm run build
+
+# Both deploy targets delegate to scripts/deploy.sh, passing the environment name
+# as its only argument.
+deploy-staging:
+	@echo "🚀 Deploying to staging..."
+	./scripts/deploy.sh staging
+
+deploy-prod:
+	@echo "🚀 Deploying to production..."
+	./scripts/deploy.sh production
+
+# Monitoring
+# Both targets delegate to scripts/monitoring-setup.sh with `start` or `stop`
+# as its only argument.
+monitoring-start:
+	@echo "📊 Starting monitoring stack..."
+	./scripts/monitoring-setup.sh start
+
+monitoring-stop:
+	@echo "📊 Stopping monitoring stack..."
+	./scripts/monitoring-setup.sh stop
+
+# Utilities
+# Directories that `clean` must never descend into: an in-repo virtualenv holds
+# *.egg-info metadata and compiled *.pyd modules that installed packages need, and
+# .git / node_modules are large trees with nothing to clean.
+CLEAN_SKIP := \( -name .git -o -name .venv -o -name venv -o -name node_modules \) -prune -o
+
+# Removes Python caches, packaging metadata, test/coverage output and the frontend
+# build output. It also deletes frontend/node_modules, so run `make install`
+# afterwards. `-exec rm` is used instead of `-delete` because `-delete` implies
+# -depth, which disables -prune.
+clean:
+	@echo "🧹 Cleaning build artifacts..."
+	find . $(CLEAN_SKIP) -type d \( -name "__pycache__" -o -name "*.egg-info" -o -name ".pytest_cache" -o -name "htmlcov" \) -exec rm -rf {} +
+	find . $(CLEAN_SKIP) -type f \( -name "*.pyc" -o -name "*.pyo" -o -name "*.pyd" -o -name ".coverage" -o -name "coverage.xml" \) -exec rm -f {} +
+	cd frontend && rm -rf build/ node_modules/ .cache/
+	@echo "✅ Cleanup completed"
+
+# Follows (-f) the logs of the compose stack until interrupted.
+logs:
+	@echo "📋 Showing application logs..."
+	docker-compose logs -f
+
+# Probes each service over HTTP. A failed probe only prints a message: because of
+# the `|| echo` fallback every line succeeds, so this target always exits 0.
+health:
+	@echo "🏥 Checking system health..."
+	curl -f http://localhost:8000/health || echo "❌ Backend health check failed"
+	curl -f http://localhost:3000 || echo "❌ Frontend health check failed"
+	curl -f http://localhost:9090/-/healthy || echo "❌ Prometheus health check failed"
+	curl -f http://localhost:3001/api/health || echo "❌ Grafana health check failed"
+
+# Security
+# Runs bandit (JSON report written to backend/bandit-report.json), safety, and a
+# Trivy filesystem scan of the repository mounted into the container at /app.
+# $(CURDIR) is make's own working directory, so the mount stays correct under
+# `make -C <dir>`, where the inherited $(PWD) would still point at the caller's directory.
+security-scan:
+	@echo "🔒 Running security scans..."
+	cd backend && bandit -r app/ -f json -o bandit-report.json
+	cd backend && safety check
+	docker run --rm -v "$(CURDIR)":/app aquasec/trivy fs /app
+
+# Performance
+# Headless Locust run: 100 users, spawned 10 per second, for 300 seconds.
+# Uses tests/performance/locustfile.py, a different file from the one `test-load` runs.
+performance-test:
+	@echo "⚡ Running performance tests..."
+	cd backend && locust -f tests/performance/locustfile.py --headless --users 100 --spawn-rate 10 --run-time 300s
+
+# Documentation
+# Starts a temporary API server, downloads its OpenAPI schema to docs/api-spec.json,
+# then stops that server. The recipe is a single shell invocation so the server PID
+# stays available: only the process started here is killed (not every uvicorn on the
+# machine), and the EXIT trap stops it even when the download fails.
+# Declared phony because a docs/ directory exists and would otherwise make this
+# target look up to date.
+.PHONY: docs
+docs:
+	@echo "📚 Generating documentation..."
+	@mkdir -p docs
+	@set -e; \
+	(cd backend && exec python -m uvicorn app.main:app --host 0.0.0.0 --port 8000) & \
+	server_pid=$$!; \
+	trap 'kill $$server_pid 2>/dev/null || true' EXIT; \
+	sleep 10; \
+	curl -fsS http://localhost:8000/openapi.json -o docs/api-spec.json
+
+# Quick start for new developers
+# Installs the development dependencies and initializes the database (via the
+# install-dev and db-init prerequisites), then prints the next commands to run.
+quickstart: install-dev db-init
+	@echo "🚀 Quick start completed!"
+	@echo "Run 'make dev' to start development servers"
+	@echo "Run 'make test' to run tests"
+	@echo "Run 'make lint' to check code quality"
diff --git a/README.md b/README.md
index 535c820..52ee389 100644
--- a/README.md
+++ b/README.md
@@ -110,4 +110,24 @@ The application can be configured through environment variables:
 
 ## License
 
-This project is licensed under the MIT License - see the LICENSE file for details.
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+## 🏆 Enterprise Support
+
+### **Professional Services**
+- **Custom Development**: Tailored features and integrations
+- **Deployment Support**: On-premise and cloud deployment
+- **Training & Consulting**: Team training and best practices
+- **24/7 Support**: Enterprise support with SLA guarantees
+
+### **Contact Information**
+- **Email**: enterprise@ai-document-agent.com
+- **Phone**: +1 (555) 123-4567
+- **Documentation**: https://docs.ai-document-agent.com
+- **Support Portal**: https://support.ai-document-agent.com
+
+---
+
+**Built with ❤️ by the AI Document Agent Team**
+
+*Empowering enterprises with intelligent document processing since 2024*
diff --git a/alembic.ini b/alembic.ini
new file mode 100644
index 0000000..88bccab
--- /dev/null
+++ b/alembic.ini
@@ -0,0 +1,112 @@
+# A generic, single database configuration.
+
+[alembic]
+# path to migration scripts
+script_location = alembic
+
+# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s
+# Uncomment the line below if you want the files to be prepended with date and time
+# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s
+
+# sys.path path, will be prepended to sys.path if present.
+# defaults to the current working directory.
+prepend_sys_path = .
+
+# timezone to use when rendering the date within the migration file
+# as well as the filename.
+# If specified, requires the python-dateutil library that can be
+# installed by adding `alembic[tz]` to the pip requirements
+# string value is passed to dateutil.tz.gettz()
+# leave blank for localtime
+# timezone =
+
+# max length of characters to apply to the
+# "slug" field
+# truncate_slug_length = 40
+
+# set to 'true' to run the environment during
+# the 'revision' command, regardless of autogenerate
+# revision_environment = false
+
+# set to 'true' to allow .pyc and .pyo files without
+# a source .py file to be detected as revisions in the
+# versions/ directory
+# sourceless = false
+
+# version number format
+# A literal percent sign must be doubled: this file is read with ConfigParser value
+# interpolation (which is why %(here)s expands and file_template uses %%(rev)s), so a
+# bare "%04d" raises InterpolationSyntaxError when the [alembic] section is read.
+version_num_format = %%04d
+
+# version path separator; As mentioned above, this is the character used to split
+# version_locations. The default within new alembic.ini files is "os", which uses
+# os.pathsep. If this key is omitted entirely, it falls back to the legacy
+# behavior of splitting on spaces and/or commas.
+# Valid values for version_path_separator are:
+#
+# version_path_separator = :
+# version_path_separator = ;
+# version_path_separator = space
+version_path_separator = os
+
+# set to 'true' to search source files recursively
+# in each "version_locations" directory
+# new in Alembic version 1.10
+# recursive_version_locations = false
+
+# the output encoding used when revision files
+# are written from script.py.mako
+# output_encoding = utf-8
+
+sqlalchemy.url = postgresql://postgres:password@localhost:5432/smart_doc_bot
+
+
+[post_write_hooks]
+# post_write_hooks defines scripts or Python functions that are run
+# on newly generated revision scripts.  See the documentation for further
+# detail and examples
+
+# format using "black" - use the console_scripts runner, against the "black" entrypoint
+# hooks = black
+# black.type = console_scripts
+# black.entrypoint = black
+# black.options = -l 79 REVISION_SCRIPT_FILENAME
+
+# lint with attempts to fix using "ruff" - use the exec runner, execute a binary
+# hooks = ruff
+# ruff.type = exec
+# ruff.executable = %(here)s/.venv/bin/ruff
+# ruff.options = --fix REVISION_SCRIPT_FILENAME
+
+# Logging configuration
+[loggers]
+keys = root,sqlalchemy,alembic
+
+[handlers]
+keys = console
+
+[formatters]
+keys = generic
+
+[logger_root]
+level = WARN
+handlers = console
+qualname =
+
+[logger_sqlalchemy]
+level = WARN
+handlers =
+qualname = sqlalchemy.engine
+
+[logger_alembic]
+level = INFO
+handlers =
+qualname = alembic
+
+[handler_console]
+class = StreamHandler
+args = (sys.stderr,)
+level = NOTSET
+formatter = generic
+
+[formatter_generic]
+format = %(levelname)-5.5s [%(name)s] %(message)s
+datefmt = %H:%M:%S
diff --git a/alembic/env.py b/alembic/env.py
new file mode 100644
index 0000000..62f963a
--- /dev/null
+++ b/alembic/env.py
@@ -0,0 +1,99 @@
+from logging.config import fileConfig
+from sqlalchemy import engine_from_config
+from sqlalchemy import pool
+from alembic import context
+import os
+import sys
+
+# Add the backend directory to the Python path
+sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'backend'))
+
+# Import your models and database configuration
+from app.database.connection import Base
+from app.database.models import (
+    User, Role, UserRole, Document, Tag, DocumentTag, ComplianceFramework,
+    ProcessingHistory, AgentExecution, DocumentComparison, AuditEvent,
+    SystemMetric, WorkflowTemplate, KnowledgeBase, Notification, APILog, SystemConfig
+)
+from app.core.config import settings
+
+# this is the Alembic Config object, which provides
+# access to the values within the .ini file in use.
+config = context.config
+
+# Interpret the config file for Python logging.
+# This line sets up loggers basically.
+if config.config_file_name is not None:
+    fileConfig(config.config_file_name)
+
+# add your model's MetaData object here
+# for 'autogenerate' support
+target_metadata = Base.metadata
+
+# other values from the config, defined by the needs of env.py,
+# can be acquired:
+# my_important_option = config.get_main_option("my_important_option")
+# ... etc.
+
+
+def get_url():
+    """Return the database URL used for migrations.
+
+    The value is read from ``settings.DATABASE_URL``; this function does not
+    read the Alembic .ini file itself.
+    """
+    return settings.DATABASE_URL
+
+
+def run_migrations_offline() -> None:
+    """Run migrations in 'offline' mode.
+
+    The context is configured from a URL only, so no Engine (and therefore
+    no DBAPI) has to be available. Calls to context.execute() emit the
+    given string to the script output.
+    """
+    offline_options = {
+        "url": get_url(),
+        "target_metadata": target_metadata,
+        "literal_binds": True,
+        "dialect_opts": {"paramstyle": "named"},
+    }
+    context.configure(**offline_options)
+
+    with context.begin_transaction():
+        context.run_migrations()
+
+
+def run_migrations_online() -> None:
+    """Run migrations in 'online' mode against a live connection.
+
+    Builds an Engine from the main section of the Alembic config, with
+    ``sqlalchemy.url`` replaced by the value from get_url(), then runs the
+    migrations inside a transaction on a connection from that Engine.
+    """
+    # get_section() can return None when the section is absent; fall back to
+    # an empty dict so the URL override below cannot fail on item assignment.
+    configuration = config.get_section(config.config_ini_section) or {}
+    configuration["sqlalchemy.url"] = get_url()
+
+    # NullPool: connections are not pooled for the migration run.
+    connectable = engine_from_config(
+        configuration,
+        prefix="sqlalchemy.",
+        poolclass=pool.NullPool,
+    )
+
+    with connectable.connect() as connection:
+        context.configure(
+            connection=connection,
+            target_metadata=target_metadata,
+            # Also compare column types and server defaults.
+            compare_type=True,
+            compare_server_default=True,
+        )
+
+        with context.begin_transaction():
+            context.run_migrations()
+
+
+# Module-level dispatch: pick the runner that matches the current mode.
+if not context.is_offline_mode():
+    run_migrations_online()
+else:
+    run_migrations_offline()
diff --git a/alembic/script.py.mako b/alembic/script.py.mako
new file mode 100644
index 0000000..55df286
--- /dev/null
+++ b/alembic/script.py.mako
@@ -0,0 +1,24 @@
+"""${message}
+
+Revision ID: ${up_revision}
+Revises: ${down_revision | comma,n}
+Create Date: ${create_date}
+
+"""
+from alembic import op
+import sqlalchemy as sa
+${imports if imports else ""}
+
+# revision identifiers, used by Alembic.
+revision = ${repr(up_revision)}
+down_revision = ${repr(down_revision)}
+branch_labels = ${repr(branch_labels)}
+depends_on = ${repr(depends_on)}
+
+
+def upgrade() -> None:
+    ${upgrades if upgrades else "pass"}
+
+
+def downgrade() -> None:
+    ${downgrades if downgrades else "pass"}
diff --git a/backend/app/agents/orchestrator.py b/backend/app/agents/orchestrator.py
index 66a3ebb..bb26bd2 100644
--- a/backend/app/agents/orchestrator.py
+++ b/backend/app/agents/orchestrator.py
@@ -1,5 +1,6 @@
 import json
 import asyncio
+import logging
 from typing import Any, Dict, List, Optional
 from datetime import datetime
 from enum import Enum
@@ -9,6 +10,7 @@
 
 from .base import BaseAgent, Tool
 from ..models.base import AgentResult, AgentType, Document
+from ..core.config import settings
 from .ingestion import IngestionAgent
 from .classifier import ClassifierAgent
 from .entity import EntityAgent
@@ -17,6 +19,9 @@
 from .compare import CompareAgent
 from .audit import AuditAgent
 
+# Configure logging
+logger = logging.getLogger(__name__)
+
 
 class WorkflowStage(Enum):
     """Workflow stage enumeration"""
@@ -410,15 +415,23 @@ async def _execute_stage(self, agent_type: str, document: Document, goal: str) -
             context = {
                 "document": document,
                 "goal": goal,
-                "orchestrator": self
+                "orchestrator": self,
+                "workflow_state": self.workflow_state
             }
             
-            # Execute agent
-            result = await agent.run(goal, context)
-            return result
+            # Execute agent with timeout
+            import asyncio
+            try:
+                result = await asyncio.wait_for(
+                    agent.run(goal, context),
+                    timeout=settings.AGENT_TIMEOUT
+                )
+                return result
+            except asyncio.TimeoutError:
+                raise Exception(f"Agent {agent_type} execution timed out after {settings.AGENT_TIMEOUT} seconds")
             
         except Exception as e:
-            print(f"Stage execution failed for {agent_type}: {str(e)}")
+            logger.error(f"Stage execution failed for {agent_type}: {str(e)}")
             return None
     
     def _calculate_confidence(self, execution_results: Dict, monitoring_result: Dict) -> float:
diff --git a/backend/app/api/v1/endpoints/auth.py b/backend/app/api/v1/endpoints/auth.py
index 6d37696..af22c2b 100644
--- a/backend/app/api/v1/endpoints/auth.py
+++ b/backend/app/api/v1/endpoints/auth.py
@@ -1,17 +1,17 @@
 from datetime import datetime, timedelta
 from typing import Optional
-from fastapi import APIRouter, Depends, HTTPException, status
+from fastapi import APIRouter, Depends, HTTPException, status, Request
 from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
 from pydantic import BaseModel, EmailStr
-import jwt
-from passlib.context import CryptContext
+from sqlalchemy.orm import Session
 
 from ...core.config import settings
-from ...core.security import create_access_token, verify_token
+from ...core.security import security_manager, get_current_user, require_permission
+from ...database.connection import get_db
+from ...database.models import User, Role, UserRole
 
 router = APIRouter()
 security = HTTPBearer()
-pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")
 
 
 class LoginRequest(BaseModel):
@@ -27,185 +27,439 @@ class LoginResponse(BaseModel):
 
 
 class UserInfo(BaseModel):
-    id: str
+    id: int
     email: str
-    username: str
-    full_name: str
-    role: str
-    permissions: list[str]
+    full_name: Optional[str]
     is_active: bool
-    created_at: str
-    last_login: Optional[str] = None
-
-
-# Mock user database - in production, this would be a real database
-MOCK_USERS = {
-    "admin@redline.com": {
-        "id": "user_001",
-        "email": "admin@redline.com",
-        "username": "admin",
-        "full_name": "System Administrator",
-        "password_hash": pwd_context.hash("admin123"),
-        "role": "admin",
-        "permissions": ["read", "write", "delete", "analyze", "admin"],
-        "is_active": True,
-        "created_at": "2024-01-01T00:00:00Z",
-        "last_login": None
-    },
-    "user@redline.com": {
-        "id": "user_002",
-        "email": "user@redline.com",
-        "username": "user",
-        "full_name": "Regular User",
-        "password_hash": pwd_context.hash("user123"),
-        "role": "user",
-        "permissions": ["read", "write", "analyze"],
-        "is_active": True,
-        "created_at": "2024-01-01T00:00:00Z",
-        "last_login": None
-    }
-}
-
-
-def verify_password(plain_password: str, hashed_password: str) -> bool:
-    """Verify a password against its hash"""
-    return pwd_context.verify(plain_password, hashed_password)
-
-
-def get_user_by_email(email: str):
-    """Get user by email from mock database"""
-    return MOCK_USERS.get(email)
-
-
-def authenticate_user(email: str, password: str):
-    """Authenticate user with email and password"""
-    user = get_user_by_email(email)
-    if not user:
-        return None
-    if not verify_password(password, user["password_hash"]):
-        return None
-    return user
-
-
-async def get_current_user(credentials: HTTPAuthorizationCredentials = Depends(security)):
-    """Get current user from JWT token"""
+    is_superuser: bool
+    created_at: datetime
+    last_login: Optional[datetime] = None
+    roles: list[str] = []
+
+
+class RegisterRequest(BaseModel):
+    """Request body accepted by the POST /register endpoint."""
+    # Validated as an e-mail address by pydantic's EmailStr.
+    email: EmailStr
+    # Plain-text password; the endpoint hashes it before storing.
+    password: str
+    # Optional display name.
+    full_name: Optional[str] = None
+
+
+class ChangePasswordRequest(BaseModel):
+    """Request body accepted by the POST /change-password endpoint."""
+    # Verified against the stored hash before the password is replaced.
+    current_password: str
+    # Plain-text replacement password; hashed by the endpoint.
+    new_password: str
+
+
+class ResetPasswordRequest(BaseModel):
+    """Request body accepted by the POST /reset-password endpoint."""
+    # Address of the account whose password reset is requested.
+    email: EmailStr
+
+
+@router.post("/login", response_model=LoginResponse)
+async def login(
+    request: LoginRequest,
+    db: Session = Depends(get_db),
+    client_request: Request = None
+):
+    """Login endpoint with comprehensive security logging"""
     try:
-        payload = verify_token(credentials.credentials)
-        email: str = payload.get("sub")
-        if email is None:
+        # Authenticate user
+        user = security_manager.authenticate_user(db, request.email, request.password)
+        if not user:
+            # Log failed login attempt
+            if client_request:
+                security_manager.log_security_event(
+                    event_type="login_failed",
+                    user_id=None,
+                    ip_address=client_request.client.host,
+                    details={"email": request.email, "reason": "invalid_credentials"}
+                )
+            
             raise HTTPException(
                 status_code=status.HTTP_401_UNAUTHORIZED,
-                detail="Could not validate credentials",
+                detail="Incorrect email or password",
                 headers={"WWW-Authenticate": "Bearer"},
             )
-    except jwt.ExpiredSignatureError:
-        raise HTTPException(
-            status_code=status.HTTP_401_UNAUTHORIZED,
-            detail="Token has expired",
-            headers={"WWW-Authenticate": "Bearer"},
-        )
-    except jwt.JWTError:
-        raise HTTPException(
-            status_code=status.HTTP_401_UNAUTHORIZED,
-            detail="Could not validate credentials",
-            headers={"WWW-Authenticate": "Bearer"},
+        
+        # Update last login
+        user.last_login = datetime.utcnow()
+        db.commit()
+        
+        # Create access token
+        access_token_expires = timedelta(minutes=settings.ACCESS_TOKEN_EXPIRE_MINUTES)
+        access_token = security_manager.create_access_token(
+            data={"sub": user.email}, expires_delta=access_token_expires
         )
-    
-    user = get_user_by_email(email)
-    if user is None:
-        raise HTTPException(
-            status_code=status.HTTP_401_UNAUTHORIZED,
-            detail="User not found",
-            headers={"WWW-Authenticate": "Bearer"},
+        
+        # Get user roles
+        user_roles = db.query(UserRole).filter(UserRole.user_id == user.id).all()
+        role_names = []
+        for user_role in user_roles:
+            role = db.query(Role).filter(Role.id == user_role.role_id).first()
+            if role:
+                role_names.append(role.name)
+        
+        # Log successful login
+        if client_request:
+            security_manager.log_security_event(
+                event_type="login_success",
+                user_id=user.id,
+                ip_address=client_request.client.host,
+                details={"email": user.email, "roles": role_names}
+            )
+        
+        return LoginResponse(
+            access_token=access_token,
+            token_type="bearer",
+            expires_in=settings.ACCESS_TOKEN_EXPIRE_MINUTES * 60,
+            user={
+                "id": user.id,
+                "email": user.email,
+                "full_name": user.full_name,
+                "is_active": user.is_active,
+                "is_superuser": user.is_superuser,
+                "roles": role_names
+            }
         )
-    
-    return user
-
-
-@router.post("/login", response_model=LoginResponse)
-async def login(request: LoginRequest):
-    """Login endpoint"""
-    user = authenticate_user(request.email, request.password)
-    if not user:
+        
+    except HTTPException:
+        raise
+    except Exception as e:
+        # Log unexpected error
+        if client_request:
+            security_manager.log_security_event(
+                event_type="login_error",
+                user_id=None,
+                ip_address=client_request.client.host,
+                details={"email": request.email, "error": str(e)}
+            )
         raise HTTPException(
-            status_code=status.HTTP_401_UNAUTHORIZED,
-            detail="Incorrect email or password",
-            headers={"WWW-Authenticate": "Bearer"},
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail="Internal server error during login"
         )
-    
-    # Update last login
-    user["last_login"] = datetime.utcnow().isoformat()
-    
-    # Create access token
-    access_token_expires = timedelta(minutes=settings.ACCESS_TOKEN_EXPIRE_MINUTES)
-    access_token = create_access_token(
-        data={"sub": user["email"]}, expires_delta=access_token_expires
-    )
-    
-    return LoginResponse(
-        access_token=access_token,
-        token_type="bearer",
-        expires_in=settings.ACCESS_TOKEN_EXPIRE_MINUTES * 60,
-        user={
-            "id": user["id"],
-            "email": user["email"],
-            "username": user["username"],
-            "full_name": user["full_name"],
-            "role": user["role"],
-            "permissions": user["permissions"],
-            "is_active": user["is_active"]
-        }
-    )
 
 
 @router.post("/logout")
-async def logout(current_user: dict = Depends(get_current_user)):
-    """Logout endpoint"""
-    # In a real implementation, you might want to blacklist the token
-    # For now, we'll just return success
-    return {"message": "Successfully logged out"}
+async def logout(
+    current_user: User = Depends(get_current_user),
+    db: Session = Depends(get_db),
+    client_request: Request = None
+):
+    """Log the current user out by blacklisting the presented bearer token.
+
+    Logout is best-effort: if blacklisting or audit logging fails, the error
+    is recorded when possible and a success-style message is still returned.
+
+    Args:
+        current_user: Authenticated user resolved by get_current_user.
+        db: Database session (not used by this handler).
+        client_request: Incoming request; source of the Authorization header
+            and of the client address used for security logging.
+
+    Returns:
+        A dict with a human-readable "message".
+    """
+    # request.client may be None, so resolve the address once, defensively,
+    # instead of dereferencing it in every logging call.
+    client = client_request.client if client_request else None
+    ip_address = client.host if client else None
+
+    try:
+        # Get the token from the request
+        if client_request and "authorization" in client_request.headers:
+            token = client_request.headers["authorization"].replace("Bearer ", "")
+            security_manager.blacklist_token(token)
+
+        # Log logout event
+        if client_request:
+            security_manager.log_security_event(
+                event_type="logout",
+                user_id=current_user.id,
+                ip_address=ip_address,
+                details={"email": current_user.email}
+            )
+
+        return {"message": "Successfully logged out"}
+
+    except Exception as e:
+        # Log error but don't fail the logout. The logging call is guarded
+        # as well, so a failing logger cannot turn logout into a 500.
+        if client_request:
+            try:
+                security_manager.log_security_event(
+                    event_type="logout_error",
+                    user_id=current_user.id,
+                    ip_address=ip_address,
+                    details={"error": str(e)}
+                )
+            except Exception:
+                # Deliberate best-effort: nothing more can be reported here.
+                pass
+        return {"message": "Logged out (with warnings)"}
 
 
 @router.get("/me", response_model=UserInfo)
-async def get_current_user_info(current_user: dict = Depends(get_current_user)):
+async def get_current_user_info(
+    current_user: User = Depends(get_current_user),
+    db: Session = Depends(get_db)
+):
     """Get current user information"""
-    return UserInfo(
-        id=current_user["id"],
-        email=current_user["email"],
-        username=current_user["username"],
-        full_name=current_user["full_name"],
-        role=current_user["role"],
-        permissions=current_user["permissions"],
-        is_active=current_user["is_active"],
-        created_at=current_user["created_at"],
-        last_login=current_user["last_login"]
-    )
+    try:
+        # Get user roles
+        user_roles = db.query(UserRole).filter(UserRole.user_id == current_user.id).all()
+        role_names = []
+        for user_role in user_roles:
+            role = db.query(Role).filter(Role.id == user_role.role_id).first()
+            if role:
+                role_names.append(role.name)
+        
+        return UserInfo(
+            id=current_user.id,
+            email=current_user.email,
+            full_name=current_user.full_name,
+            is_active=current_user.is_active,
+            is_superuser=current_user.is_superuser,
+            created_at=current_user.created_at,
+            last_login=current_user.last_login,
+            roles=role_names
+        )
+        
+    except Exception as e:
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail="Failed to retrieve user information"
+        )
 
 
 @router.post("/refresh")
-async def refresh_token(current_user: dict = Depends(get_current_user)):
+async def refresh_token(
+    current_user: User = Depends(get_current_user),
+    client_request: Request = None
+):
     """Refresh access token"""
-    access_token_expires = timedelta(minutes=settings.ACCESS_TOKEN_EXPIRE_MINUTES)
-    access_token = create_access_token(
-        data={"sub": current_user["email"]}, expires_delta=access_token_expires
-    )
-    
-    return {
-        "access_token": access_token,
-        "token_type": "bearer",
-        "expires_in": settings.ACCESS_TOKEN_EXPIRE_MINUTES * 60
-    }
+    try:
+        access_token_expires = timedelta(minutes=settings.ACCESS_TOKEN_EXPIRE_MINUTES)
+        access_token = security_manager.create_access_token(
+            data={"sub": current_user.email}, expires_delta=access_token_expires
+        )
+        
+        # Log token refresh
+        if client_request:
+            security_manager.log_security_event(
+                event_type="token_refresh",
+                user_id=current_user.id,
+                ip_address=client_request.client.host,
+                details={"email": current_user.email}
+            )
+        
+        return {
+            "access_token": access_token,
+            "token_type": "bearer",
+            "expires_in": settings.ACCESS_TOKEN_EXPIRE_MINUTES * 60
+        }
+        
+    except Exception as e:
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail="Failed to refresh token"
+        )
+
+
+@router.post("/register")
+async def register_user(
+    request: RegisterRequest,
+    db: Session = Depends(get_db),
+    client_request: Request = None
+):
+    """Create an active, non-superuser account and give it the "user" role.
+
+    NOTE(review): this endpoint was described as "admin only" and the audit
+    entry records "registered_by": "admin", but no authentication or
+    permission dependency is declared on the route, so any caller can
+    register. Confirm the intended policy and add a require_permission(...)
+    dependency if registration must be restricted.
+
+    Args:
+        request: Email, plain-text password and optional full name.
+        db: Database session.
+        client_request: Incoming request, used for the audit log entry.
+
+    Returns:
+        A dict with "message", "user_id" and "email" of the new account.
+
+    Raises:
+        HTTPException: 400 if the email is already registered; 500 on any
+            other failure (the session is rolled back first).
+    """
+    try:
+        # Check if user already exists
+        existing_user = security_manager.get_user_by_email(db, request.email)
+        if existing_user:
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail="User with this email already exists"
+            )
+
+        # Create new user
+        hashed_password = security_manager.get_password_hash(request.password)
+        new_user = User(
+            email=request.email,
+            hashed_password=hashed_password,
+            full_name=request.full_name,
+            is_active=True,
+            is_superuser=False
+        )
+
+        db.add(new_user)
+        # flush() assigns new_user.id without committing, so the user row and
+        # its role link below are committed together (or not at all).
+        db.flush()
+
+        # Assign default user role
+        default_role = db.query(Role).filter(Role.name == "user").first()
+        if default_role:
+            db.add(UserRole(user_id=new_user.id, role_id=default_role.id))
+
+        db.commit()
+        db.refresh(new_user)
+
+        # Log user registration
+        if client_request:
+            # request.client may be None; fall back to no address.
+            client = client_request.client
+            security_manager.log_security_event(
+                event_type="user_registered",
+                user_id=new_user.id,
+                ip_address=client.host if client else None,
+                details={"email": new_user.email, "registered_by": "admin"}
+            )
+
+        return {
+            "message": "User registered successfully",
+            "user_id": new_user.id,
+            "email": new_user.email
+        }
+
+    except HTTPException:
+        raise
+    except Exception:
+        db.rollback()
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail="Failed to register user"
+        )
+
+
+@router.post("/change-password")
+async def change_password(
+    request: ChangePasswordRequest,
+    current_user: User = Depends(get_current_user),
+    db: Session = Depends(get_db),
+    client_request: Request = None
+):
+    """Replace the authenticated user's password.
+
+    The supplied current password is checked against the stored hash; on a
+    match the new password is hashed, stored and committed, and a
+    "password_changed" security event is logged when a request is available.
+
+    Raises:
+        HTTPException: 400 if the current password does not match; 500 on
+            any other failure (the session is rolled back first).
+    """
+    try:
+        password_matches = security_manager.verify_password(
+            request.current_password, current_user.hashed_password
+        )
+        if not password_matches:
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail="Current password is incorrect"
+            )
+
+        # Store the new hash and persist it.
+        current_user.hashed_password = security_manager.get_password_hash(
+            request.new_password
+        )
+        db.commit()
+
+        # Audit trail entry for the change.
+        if client_request:
+            audit_entry = dict(
+                event_type="password_changed",
+                user_id=current_user.id,
+                ip_address=client_request.client.host,
+                details={"email": current_user.email},
+            )
+            security_manager.log_security_event(**audit_entry)
+
+        return {"message": "Password changed successfully"}
+
+    except HTTPException:
+        # Already a well-formed HTTP error; pass it through untouched.
+        raise
+    except Exception:
+        db.rollback()
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail="Failed to change password"
+        )
+
+
+@router.post("/reset-password")
+async def reset_password(
+    request: ResetPasswordRequest,
+    db: Session = Depends(get_db),
+    client_request: Request = None
+):
+    """Handle a password-reset request for the given email address.
+
+    The same message is returned whether or not the account exists, so the
+    response does not reveal which addresses are registered. For an existing
+    account a one-hour token is created and the request is logged; sending
+    the email is still a TODO.
+
+    Raises:
+        HTTPException: 500 if anything fails while processing the request.
+    """
+    try:
+        account = security_manager.get_user_by_email(db, request.email)
+
+        if account:
+            # Token valid for one hour, tagged as a password-reset token.
+            reset_token = security_manager.create_access_token(
+                data={"sub": account.email, "type": "password_reset"},
+                expires_delta=timedelta(hours=1)
+            )
+
+            # TODO: Send email with reset link
+            # In production, this would send an actual email
+
+            # Record the request in the security log.
+            if client_request:
+                audit_entry = dict(
+                    event_type="password_reset_requested",
+                    user_id=account.id,
+                    ip_address=client_request.client.host,
+                    details={"email": account.email},
+                )
+                security_manager.log_security_event(**audit_entry)
+
+        # Identical reply for known and unknown addresses.
+        return {"message": "If the email exists, a reset link has been sent"}
+
+    except Exception:
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail="Failed to process password reset request"
+        )
 
 
 @router.get("/validate")
-async def validate_token(current_user: dict = Depends(get_current_user)):
+async def validate_token(
+    current_user: User = Depends(get_current_user),
+    db: Session = Depends(get_db)
+):
     """Validate current token"""
-    return {
-        "valid": True,
-        "user": {
-            "id": current_user["id"],
-            "email": current_user["email"],
-            "username": current_user["username"],
-            "role": current_user["role"]
+    try:
+        # Get user roles
+        user_roles = db.query(UserRole).filter(UserRole.user_id == current_user.id).all()
+        role_names = []
+        for user_role in user_roles:
+            role = db.query(Role).filter(Role.id == user_role.role_id).first()
+            if role:
+                role_names.append(role.name)
+        
+        return {
+            "valid": True,
+            "user": {
+                "id": current_user.id,
+                "email": current_user.email,
+                "full_name": current_user.full_name,
+                "is_active": current_user.is_active,
+                "is_superuser": current_user.is_superuser,
+                "roles": role_names
+            }
+        }
+        
+    except Exception as e:
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail="Failed to validate token"
+        )
+
+
+@router.get("/permissions")
+async def get_user_permissions(
+    current_user: User = Depends(get_current_user),
+    db: Session = Depends(get_db)
+):
+    """Get current user permissions"""
+    try:
+        permissions = security_manager.get_user_permissions(db, current_user)
+        return {
+            "user_id": current_user.id,
+            "email": current_user.email,
+            "permissions": permissions
         }
-    }
+        
+    except Exception as e:
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail="Failed to retrieve permissions"
+        )
+
+
+@router.get("/security-events")
+async def get_security_events(
+    current_user: User = Depends(require_permission("admin:security_events")),
+    limit: int = 100
+):
+    """Return entries from the head of the "security_events" Redis list.
+
+    Args:
+        current_user: Caller, who must hold "admin:security_events".
+        limit: Maximum number of events to return; capped at 1000, and
+            values below 1 yield an empty result.
+
+    Returns:
+        A dict with "events" (decoded entries) and "total" (their count).
+
+    Raises:
+        HTTPException: 500 if the events cannot be read or decoded.
+    """
+    # Local imports: used only by this handler.
+    import ast
+    import json
+
+    def _decode(raw):
+        """Parse one stored event without executing it as code."""
+        text = raw.decode("utf-8") if isinstance(raw, bytes) else raw
+        try:
+            return json.loads(text)
+        except ValueError:
+            # NOTE(review): the storage format is not visible here. Entries
+            # written as a Python dict repr are handled by literal_eval,
+            # which accepts literals only (unlike the eval() it replaces).
+            return ast.literal_eval(text)
+
+    try:
+        count = max(0, min(limit, 1000))
+        events = []
+        if count:
+            # One LRANGE instead of one LINDEX round-trip per event.
+            raw_events = security_manager.redis_client.lrange(
+                "security_events", 0, count - 1
+            )
+            events = [_decode(raw) for raw in raw_events if raw]
+
+        return {
+            "events": events,
+            "total": len(events)
+        }
+
+    except Exception:
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail="Failed to retrieve security events"
+        )
diff --git a/backend/app/core/celery_config.py b/backend/app/core/celery_config.py
new file mode 100644
index 0000000..b4b867d
--- /dev/null
+++ b/backend/app/core/celery_config.py
@@ -0,0 +1,195 @@
+"""
+Celery Configuration for AI Document Agent
+Handles distributed task processing for document analysis and AI operations
+"""
+
+import os
+from celery import Celery
+from celery.schedules import crontab
+from .config import settings
+
+# Task modules handed to Celery through the `include` option.
+_TASK_MODULES = [
+    "app.tasks.document_tasks",
+    "app.tasks.agent_tasks",
+    "app.tasks.analytics_tasks",
+    "app.tasks.maintenance_tasks",
+]
+
+# Application instance; the same Redis URL is used as broker and backend.
+celery_app = Celery(
+    "ai_document_agent",
+    broker=settings.REDIS_URL,
+    backend=settings.REDIS_URL,
+    include=_TASK_MODULES,
+)
+
+# Celery Configuration
+# Each setting is passed exactly once: a repeated keyword argument is a
+# SyntaxError and would make this module impossible to import.
+celery_app.conf.update(
+    # Task routing: one queue per task module
+    task_routes={
+        "app.tasks.document_tasks.*": {"queue": "documents"},
+        "app.tasks.agent_tasks.*": {"queue": "agents"},
+        "app.tasks.analytics_tasks.*": {"queue": "analytics"},
+        "app.tasks.maintenance_tasks.*": {"queue": "maintenance"},
+    },
+
+    # Task serialization
+    task_serializer="json",
+    accept_content=["json"],
+    result_serializer="json",
+    timezone="UTC",
+    enable_utc=True,
+
+    # Task execution
+    task_always_eager=False,
+    task_eager_propagates=True,
+    task_ignore_result=False,
+    task_store_errors_even_if_ignored=True,
+
+    # Worker configuration
+    worker_prefetch_multiplier=1,
+    worker_max_tasks_per_child=1000,
+    worker_disable_rate_limits=False,
+    worker_send_task_events=True,  # also covers the monitoring section below
+
+    # Result backend
+    result_expires=3600,  # 1 hour
+    # NOTE(review): "master_name" looks like a Redis Sentinel option and
+    # "visibility_timeout" like a broker transport option - confirm both
+    # belong under the result backend.
+    result_backend_transport_options={
+        "master_name": "mymaster",
+        "visibility_timeout": 3600,
+    },
+
+    # Beat schedule for periodic tasks
+    beat_schedule={
+        "cleanup-expired-documents": {
+            "task": "app.tasks.maintenance_tasks.cleanup_expired_documents",
+            "schedule": crontab(hour=2, minute=0),  # Daily at 2 AM
+        },
+        "generate-analytics-reports": {
+            "task": "app.tasks.analytics_tasks.generate_daily_reports",
+            "schedule": crontab(hour=6, minute=0),  # Daily at 6 AM
+        },
+        "backup-database": {
+            "task": "app.tasks.maintenance_tasks.backup_database",
+            "schedule": crontab(hour=1, minute=0),  # Daily at 1 AM
+        },
+        "cleanup-audit-logs": {
+            "task": "app.tasks.maintenance_tasks.cleanup_audit_logs",
+            "schedule": crontab(hour=3, minute=0),  # Daily at 3 AM
+        },
+        "update-system-metrics": {
+            "task": "app.tasks.analytics_tasks.update_system_metrics",
+            "schedule": 300.0,  # Every 5 minutes
+        },
+        "process-pending-documents": {
+            "task": "app.tasks.document_tasks.process_pending_documents",
+            "schedule": 60.0,  # Every minute
+        },
+    },
+
+    # Task time limits
+    task_soft_time_limit=300,  # 5 minutes
+    task_time_limit=600,  # 10 minutes
+
+    # Retry configuration
+    task_acks_late=True,
+    task_reject_on_worker_lost=True,
+    task_remote_tracebacks=True,
+
+    # Monitoring (worker_send_task_events is set under "Worker configuration")
+    task_send_sent_event=True,
+
+    # Security
+    # NOTE(review): Celery documents security_key as the path to a
+    # private-key file for message signing; SECRET_KEY looks like a secret
+    # string rather than a path - confirm this setting is intended.
+    security_key=settings.SECRET_KEY,
+    security_certificate=None,
+    security_cert_store=None,
+)
+
+# Per-task overrides: rate limit plus hard/soft time limits (in seconds).
+celery_app.conf.task_annotations = {
+    # 10 per minute; hard limit 10 minutes, soft limit 5 minutes
+    "app.tasks.document_tasks.process_document": dict(
+        rate_limit="10/m", time_limit=600, soft_time_limit=300
+    ),
+    # 5 per minute; hard limit 15 minutes, soft limit 10 minutes
+    "app.tasks.agent_tasks.execute_agent": dict(
+        rate_limit="5/m", time_limit=900, soft_time_limit=600
+    ),
+    # 1 per hour; hard limit 30 minutes, soft limit 20 minutes
+    "app.tasks.analytics_tasks.generate_report": dict(
+        rate_limit="1/h", time_limit=1800, soft_time_limit=1200
+    ),
+}
+
+# Diagnostic task
+@celery_app.task(bind=True)
+def debug_task(self):
+    """Print the repr of this task's request; handy for checking the setup."""
+    request_repr = repr(self.request)
+    print("Request: " + request_repr)
+
+# Health check task
+@celery_app.task
+def health_check():
+    """Report worker liveness for monitoring.
+
+    Returns:
+        A dict with "status" ("healthy"), "timestamp" (current UTC time,
+        formatted like "2024-01-01T00:00:00Z") and "version"
+        (settings.APP_VERSION).
+    """
+    # Local import: the module does not import datetime at file level.
+    from datetime import datetime, timezone
+
+    return {
+        "status": "healthy",
+        # Use the time of the check rather than a fixed date.
+        "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
+        "version": settings.APP_VERSION
+    }
+
+# Task failure handling
+@celery_app.task(bind=True, max_retries=3)
+def handle_task_failure(self, task_id, exc, traceback):
+    """Report a failed task and retry this handler with exponential backoff.
+
+    Args:
+        task_id: Id of the task that failed.
+        exc: Description of the failure; also passed to self.retry().
+        traceback: Traceback information for the failure (not used here).
+
+    Raises:
+        celery.exceptions.Retry: while retries remain, to signal the retry.
+    """
+    # Local import from the already-used celery package.
+    from celery.exceptions import Retry
+
+    try:
+        # Log the failure
+        print(f"Task {task_id} failed: {exc}")
+
+        # Retry with exponential backoff
+        if self.request.retries < self.max_retries:
+            countdown = 2 ** self.request.retries  # 1s, 2s, 4s
+            raise self.retry(countdown=countdown, exc=exc)
+        else:
+            # Max retries reached, mark as permanently failed
+            print(f"Task {task_id} permanently failed after {self.max_retries} retries")
+
+    except Retry:
+        # Retry is how self.retry() signals the retry; let it propagate
+        # instead of reporting it as a handler error below.
+        raise
+    except Exception as e:
+        print(f"Error handling task failure: {e}")
+
+# Task success handling
+@celery_app.task
+def handle_task_success(task_id, result):
+    """Print a completion line for a finished task.
+
+    Errors raised while building or printing the line are caught and
+    reported with a second print.
+    """
+    try:
+        message = "Task {} completed successfully: {}".format(task_id, result)
+        print(message)
+        # Additional success handling logic here
+    except Exception as err:
+        print("Error handling task success: {}".format(err))
+
+# Celery signal handlers
+from celery.signals import task_success, task_failure, task_revoked
+
+@task_success.connect
+def task_success_handler(sender=None, **kwargs):
+    """Queue handle_task_success for results that carry an "id" entry.
+
+    Results that are not dicts (None, strings, lists, ...) are ignored
+    instead of raising inside the signal handler.
+    """
+    result = kwargs.get("result")
+    # Only dict results can carry an "id"; calling .get() on anything else
+    # would raise AttributeError.
+    task_id = result.get("id") if isinstance(result, dict) else None
+    if task_id:
+        handle_task_success.delay(task_id, result)
+
+@task_failure.connect
+def task_failure_handler(sender=None, task_id=None, exception=None, traceback=None, einfo=None, **kwargs):
+    """Queue handle_task_failure for a failed task.
+
+    The exception and traceback are sent as text because tasks in this app
+    are serialized as JSON, which cannot encode a traceback object.
+    """
+    if not task_id:
+        return
+
+    # Aliased because the `traceback` parameter shadows the stdlib module.
+    import traceback as traceback_module
+
+    if traceback is None or isinstance(traceback, str):
+        traceback_text = traceback
+    else:
+        try:
+            traceback_text = "".join(traceback_module.format_tb(traceback))
+        except Exception:
+            # Not a real traceback object; fall back to its string form.
+            traceback_text = str(traceback)
+
+    handle_task_failure.delay(task_id, str(exception), traceback_text)
+
+@task_revoked.connect
+def task_revoked_handler(sender=None, request=None, terminated=None, signum=None, expired=None, **kwargs):
+    """Print a notice when a task with a known id is revoked."""
+    # No request, or a request without an id: nothing to report.
+    if not request:
+        return
+    revoked_id = request.id
+    if not revoked_id:
+        return
+    print(f"Task {revoked_id} was revoked (terminated={terminated}, expired={expired})")
+
+# Export the Celery app
+__all__ = ["celery_app"]
diff --git a/backend/app/core/config.py b/backend/app/core/config.py
index 1e667d8..901b9a2 100644
--- a/backend/app/core/config.py
+++ b/backend/app/core/config.py
@@ -1,79 +1,109 @@
 import os
-from typing import List, Optional
-from pydantic import BaseSettings, Field
+from typing import List, Optional, Dict, Any
+from pydantic import Field, validator  # BaseSettings left pydantic in v2; importing it from there raises
+from pydantic_settings import BaseSettings, BaseSettings as PydanticBaseSettings
 
 
-class Settings(BaseSettings):
-    """Application settings"""
+class Settings(PydanticBaseSettings):
+    """Application settings with environment variable support"""
     
     # Application settings
-    APP_NAME: str = "AI Document Agent"
-    APP_VERSION: str = "1.0.0"
+    APP_NAME: str = Field(default="AI Document Agent", env="APP_NAME")
+    APP_VERSION: str = Field(default="1.0.0", env="APP_VERSION")
     DEBUG: bool = Field(default=False, env="DEBUG")
+    LOG_LEVEL: str = Field(default="INFO", env="LOG_LEVEL")
+    
+    # Server settings
     HOST: str = Field(default="0.0.0.0", env="HOST")
     PORT: int = Field(default=8000, env="PORT")
     
     # Security settings
     SECRET_KEY: str = Field(default="your-secret-key-here", env="SECRET_KEY")
-    ALGORITHM: str = "HS256"
+    ALGORITHM: str = Field(default="HS256", env="ALGORITHM")
     ACCESS_TOKEN_EXPIRE_MINUTES: int = Field(default=30, env="ACCESS_TOKEN_EXPIRE_MINUTES")
     
-    # CORS settings
-    ALLOWED_ORIGINS: List[str] = Field(
-        default=["http://localhost:3000", "http://localhost:8080"],
-        env="ALLOWED_ORIGINS"
-    )
-    ALLOWED_HOSTS: List[str] = Field(
-        default=["localhost", "127.0.0.1"],
-        env="ALLOWED_HOSTS"
-    )
-    
-    # File upload settings
+    # Database settings
+    DATABASE_URL: str = Field(default="postgresql://user:password@localhost/ai_document_agent", env="DATABASE_URL")
+    DATABASE_POOL_SIZE: int = Field(default=10, env="DATABASE_POOL_SIZE")
+    DATABASE_MAX_OVERFLOW: int = Field(default=20, env="DATABASE_MAX_OVERFLOW")
+    
+    # Redis settings
+    REDIS_URL: str = Field(default="redis://localhost:6379/0", env="REDIS_URL")
+    REDIS_MAX_CONNECTIONS: int = Field(default=10, env="REDIS_MAX_CONNECTIONS")
+    
+    # ChromaDB settings
+    CHROMA_PERSIST_DIRECTORY: str = Field(default="./chroma_db", env="CHROMA_PERSIST_DIRECTORY")
+    CHROMA_COLLECTION_NAME: str = Field(default="documents", env="CHROMA_COLLECTION_NAME")
+    
+    # File storage settings
     UPLOAD_DIR: str = Field(default="./uploads", env="UPLOAD_DIR")
-    MAX_FILE_SIZE: int = Field(default=50 * 1024 * 1024, env="MAX_FILE_SIZE")  # 50MB
-    ALLOWED_FILE_TYPES: List[str] = Field(
-        default=[".pdf", ".docx", ".txt", ".csv", ".xlsx"],
-        env="ALLOWED_FILE_TYPES"
-    )
-    
-    # LLM settings
-    LLM_MODEL: str = Field(default="gpt-4", env="LLM_MODEL")
-    LLM_API_KEY: str = Field(default="", env="OPENAI_API_KEY")
-    LLM_TEMPERATURE: float = Field(default=0.1, env="LLM_TEMPERATURE")
-    LLM_MAX_TOKENS: int = Field(default=4000, env="LLM_MAX_TOKENS")
+    MAX_FILE_SIZE: int = Field(default=100 * 1024 * 1024, env="MAX_FILE_SIZE")  # 100MB
+    ALLOWED_FILE_TYPES: List[str] = Field(default=[".pdf", ".docx", ".txt", ".csv", ".xlsx"], env="ALLOWED_FILE_TYPES")
+    
+    # CORS settings
+    ALLOWED_ORIGINS: List[str] = Field(default=["http://localhost:3000"], env="ALLOWED_ORIGINS")
+    ALLOWED_METHODS: List[str] = Field(default=["GET", "POST", "PUT", "DELETE", "OPTIONS"], env="ALLOWED_METHODS")
+    ALLOWED_HEADERS: List[str] = Field(default=["*"], env="ALLOWED_HEADERS")
+    
+    # Monitoring settings
+    ENABLE_MONITORING: bool = Field(default=True, env="ENABLE_MONITORING")
+    PROMETHEUS_PORT: int = Field(default=9090, env="PROMETHEUS_PORT")
+    GRAFANA_PORT: int = Field(default=3001, env="GRAFANA_PORT")
+    
+    # AI/ML settings
+    OPENAI_API_KEY: str = Field(default="", env="OPENAI_API_KEY")
+    OPENAI_MODEL: str = Field(default="gpt-4", env="OPENAI_MODEL")
+    OPENAI_MAX_TOKENS: int = Field(default=4000, env="OPENAI_MAX_TOKENS")
     
     # Agent settings
     AGENT_TIMEOUT: int = Field(default=300, env="AGENT_TIMEOUT")  # 5 minutes
     AGENT_MAX_RETRIES: int = Field(default=3, env="AGENT_MAX_RETRIES")
-    AGENT_CONFIDENCE_THRESHOLD: float = Field(default=0.7, env="AGENT_CONFIDENCE_THRESHOLD")
+    AGENT_CONCURRENT_LIMIT: int = Field(default=10, env="AGENT_CONCURRENT_LIMIT")
     
-    # Workflow settings
-    WORKFLOW_MAX_STAGES: int = Field(default=10, env="WORKFLOW_MAX_STAGES")
-    WORKFLOW_PARALLEL_EXECUTION: bool = Field(default=True, env="WORKFLOW_PARALLEL_EXECUTION")
-    WORKFLOW_MONITORING_INTERVAL: int = Field(default=5, env="WORKFLOW_MONITORING_INTERVAL")
+    # Rate limiting settings
+    RATE_LIMIT_REQUESTS: int = Field(default=1000, env="RATE_LIMIT_REQUESTS")
+    RATE_LIMIT_WINDOW: int = Field(default=3600, env="RATE_LIMIT_WINDOW")  # 1 hour
     
-    # Database settings
-    DATABASE_URL: str = Field(default="sqlite:///./smart_doc_bot.db", env="DATABASE_URL")
-    REDIS_URL: str = Field(default="redis://localhost:6379", env="REDIS_URL")
-    CHROMA_PERSIST_DIRECTORY: str = Field(default="./chroma_db", env="CHROMA_PERSIST_DIRECTORY")
+    # Audit settings
+    AUDIT_LOG_ENABLED: bool = Field(default=True, env="AUDIT_LOG_ENABLED")
+    AUDIT_LOG_RETENTION_DAYS: int = Field(default=90, env="AUDIT_LOG_RETENTION_DAYS")
     
-    # Memory settings
-    MEMORY_TTL: int = Field(default=3600, env="MEMORY_TTL")  # 1 hour
-    MEMORY_MAX_SIZE: int = Field(default=1000, env="MEMORY_MAX_SIZE")
-    VECTOR_SIMILARITY_THRESHOLD: float = Field(default=0.8, env="VECTOR_SIMILARITY_THRESHOLD")
+    # Email settings (for notifications)
+    SMTP_HOST: str = Field(default="", env="SMTP_HOST")
+    SMTP_PORT: int = Field(default=587, env="SMTP_PORT")
+    SMTP_USERNAME: str = Field(default="", env="SMTP_USERNAME")
+    SMTP_PASSWORD: str = Field(default="", env="SMTP_PASSWORD")
+    SMTP_USE_TLS: bool = Field(default=True, env="SMTP_USE_TLS")
     
-    # Monitoring settings
-    ENABLE_MONITORING: bool = Field(default=True, env="ENABLE_MONITORING")
-    METRICS_PORT: int = Field(default=9090, env="METRICS_PORT")
-    LOG_LEVEL: str = Field(default="INFO", env="LOG_LEVEL")
+    # Backup settings
+    BACKUP_ENABLED: bool = Field(default=True, env="BACKUP_ENABLED")
+    BACKUP_RETENTION_DAYS: int = Field(default=30, env="BACKUP_RETENTION_DAYS")
+    BACKUP_SCHEDULE: str = Field(default="0 2 * * *", env="BACKUP_SCHEDULE")  # Daily at 2 AM
     
-    # Audit settings
-    AUDIT_ENABLED: bool = Field(default=True, env="AUDIT_ENABLED")
-    AUDIT_RETENTION_DAYS: int = Field(default=90, env="AUDIT_RETENTION_DAYS")
-    AUDIT_ENCRYPTION_ENABLED: bool = Field(default=False, env="AUDIT_ENCRYPTION_ENABLED")
+    # Performance settings
+    WORKER_PROCESSES: int = Field(default=4, env="WORKER_PROCESSES")
+    MAX_CONCURRENT_REQUESTS: int = Field(default=100, env="MAX_CONCURRENT_REQUESTS")
+    
+    # Feature flags
+    ENABLE_WEBSOCKETS: bool = Field(default=True, env="ENABLE_WEBSOCKETS")
+    ENABLE_SSE: bool = Field(default=True, env="ENABLE_SSE")
+    ENABLE_REAL_TIME_UPDATES: bool = Field(default=True, env="ENABLE_REAL_TIME_UPDATES")
     
     class Config:
         env_file = ".env"
+        case_sensitive = True
+    
+    @validator("ALLOWED_ORIGINS", pre=True)
+    def parse_allowed_origins(cls, v):
+        """Split a comma-separated string into origins, dropping blank entries; non-string values pass through"""
+        parts = v.split(",") if isinstance(v, str) else None
+        return v if parts is None else [origin.strip() for origin in parts if origin.strip()]
+    
+    @validator("ALLOWED_FILE_TYPES", pre=True)
+    def parse_allowed_file_types(cls, v):
+        """Split a comma-separated string into file types, dropping blank entries; non-string values pass through"""
+        parts = v.split(",") if isinstance(v, str) else None
+        return v if parts is None else [file_type.strip() for file_type in parts if file_type.strip()]
 
 
 # Global settings instance
@@ -85,23 +115,130 @@ def get_settings() -> Settings:
     return settings
 
 
-def get_agent_config():
-    """Get agent configuration"""
+def get_agent_config() -> Dict[str, Any]:
+    """Return the agent limits plus the OpenAI and ChromaDB options read from the global settings"""
+    return dict(
+        timeout=settings.AGENT_TIMEOUT,
+        max_retries=settings.AGENT_MAX_RETRIES,
+        concurrent_limit=settings.AGENT_CONCURRENT_LIMIT,
+        openai_model=settings.OPENAI_MODEL,
+        openai_max_tokens=settings.OPENAI_MAX_TOKENS,
+        chroma_collection=settings.CHROMA_COLLECTION_NAME,
+        chroma_persist_dir=settings.CHROMA_PERSIST_DIRECTORY,
+    )
+
+
+def get_workflow_config() -> Dict[str, Any]:
+    """Return the upload limits, backup/audit toggles and rate-limit values read from the global settings"""
     return {
-        "TIMEOUT": settings.AGENT_TIMEOUT,
-        "MAX_RETRIES": settings.AGENT_MAX_RETRIES,
-        "CONFIDENCE_THRESHOLD": settings.AGENT_CONFIDENCE_THRESHOLD,
-        "LLM_MODEL": settings.LLM_MODEL,
-        "LLM_API_KEY": settings.LLM_API_KEY,
-        "LLM_TEMPERATURE": settings.LLM_TEMPERATURE,
-        "LLM_MAX_TOKENS": settings.LLM_MAX_TOKENS
+        "max_file_size": settings.MAX_FILE_SIZE,
+        "allowed_file_types": settings.ALLOWED_FILE_TYPES,
+        "upload_dir": settings.UPLOAD_DIR,
+        "backup_enabled": settings.BACKUP_ENABLED,
+        "audit_enabled": settings.AUDIT_LOG_ENABLED,
+        "rate_limit_requests": settings.RATE_LIMIT_REQUESTS,
+        "rate_limit_window": settings.RATE_LIMIT_WINDOW
     }
 
 
-def get_workflow_config():
-    """Get workflow configuration"""
+def get_database_config() -> Dict[str, Any]:
+    """Return the database URL and pool sizing read from the global settings, plus the echo option"""
+    db_config: Dict[str, Any] = {"url": settings.DATABASE_URL}
+    db_config["pool_size"] = settings.DATABASE_POOL_SIZE
+    db_config["max_overflow"] = settings.DATABASE_MAX_OVERFLOW
+    # The echo option simply mirrors the DEBUG flag.
+    db_config["echo"] = settings.DEBUG
+    return db_config
+
+
+def get_redis_config() -> Dict[str, Any]:
+    """Return the Redis URL and connection cap read from the global settings, with decode_responses fixed to True"""
+    return dict(
+        url=settings.REDIS_URL,
+        max_connections=settings.REDIS_MAX_CONNECTIONS,
+        decode_responses=True,
+    )
+
+
+def get_monitoring_config() -> Dict[str, Any]:
+    """Return the monitoring options read from the global settings.
+
+    Keys: enabled, prometheus_port, grafana_port, log_level.
+    """
+    pairs = (("enabled", "ENABLE_MONITORING"), ("prometheus_port", "PROMETHEUS_PORT"),
+             ("grafana_port", "GRAFANA_PORT"), ("log_level", "LOG_LEVEL"))
+    return {key: getattr(settings, attribute) for key, attribute in pairs}
+
+
+def get_security_config() -> Dict[str, Any]:
+    """Return the token, CORS and rate-limit options from the global settings; the result holds SECRET_KEY verbatim"""
+    return dict(
+        secret_key=settings.SECRET_KEY,
+        algorithm=settings.ALGORITHM,
+        access_token_expire_minutes=settings.ACCESS_TOKEN_EXPIRE_MINUTES,
+        allowed_origins=settings.ALLOWED_ORIGINS,
+        allowed_methods=settings.ALLOWED_METHODS,
+        allowed_headers=settings.ALLOWED_HEADERS,
+        rate_limit_requests=settings.RATE_LIMIT_REQUESTS,
+        rate_limit_window=settings.RATE_LIMIT_WINDOW,
+    )
+
+
+def get_email_config() -> Dict[str, Any]:
+    """Return the SMTP options read from the global settings.
+
+    The SMTP password is included verbatim.
+    """
+    option_names = ("smtp_host", "smtp_port", "smtp_username", "smtp_password", "smtp_use_tls")
+    # Each key is the lower-cased name of the Settings attribute it mirrors,
+    # e.g. "smtp_host" -> settings.SMTP_HOST.
+    return {name: getattr(settings, name.upper()) for name in option_names}
+
+
+def validate_settings() -> List[str]:
+    """Validate settings and return a list of human-readable issues.
+
+    An empty list means nothing was flagged. Missing upload and ChromaDB directories are created as a side effect.
+    """
+    issues = []
+    
+    # Check required settings
+    if not settings.SECRET_KEY or settings.SECRET_KEY == "your-secret-key-here":
+        issues.append("SECRET_KEY must be set to a secure value")
+    
+    if not settings.OPENAI_API_KEY:
+        issues.append("OPENAI_API_KEY must be set for AI functionality")
+    
+    if not settings.DATABASE_URL or "localhost" in settings.DATABASE_URL:
+        issues.append("DATABASE_URL should point to a production database")
+    
+    # Check file paths. No exists() pre-check: makedirs must also report a path that exists but is not a directory.
+    for label, path in (
+        ("upload", settings.UPLOAD_DIR),
+        ("ChromaDB", settings.CHROMA_PERSIST_DIRECTORY),
+    ):
+        try:
+            os.makedirs(path, exist_ok=True)
+        except (OSError, ValueError) as e:
+            issues.append(f"Cannot create {label} directory: {e}")
+    
+    return issues
+
+
+def get_environment_info() -> Dict[str, Any]:
+    """Get environment information for debugging; credentials embedded in the database and Redis URLs are shown as ***"""
     return {
-        "MAX_STAGES": settings.WORKFLOW_MAX_STAGES,
-        "PARALLEL_EXECUTION": settings.WORKFLOW_PARALLEL_EXECUTION,
-        "MONITORING_INTERVAL": settings.WORKFLOW_MONITORING_INTERVAL
+        "app_name": settings.APP_NAME,
+        "app_version": settings.APP_VERSION,
+        "debug": settings.DEBUG,
+        "log_level": settings.LOG_LEVEL,
+        "host": settings.HOST,
+        "port": settings.PORT,
+        "database_url": settings.DATABASE_URL.split("://", 1)[0] + "://***@" + settings.DATABASE_URL.rsplit("@", 1)[1] if "://" in settings.DATABASE_URL and "@" in settings.DATABASE_URL else settings.DATABASE_URL,  # scheme + *** + text after the last "@"
+        "redis_url": settings.REDIS_URL.split("://", 1)[0] + "://***@" + settings.REDIS_URL.rsplit("@", 1)[1] if "://" in settings.REDIS_URL and "@" in settings.REDIS_URL else settings.REDIS_URL,  # unchanged when the URL has no "@"
+        "openai_model": settings.OPENAI_MODEL,
+        "monitoring_enabled": settings.ENABLE_MONITORING,
+        "websockets_enabled": settings.ENABLE_WEBSOCKETS,
+        "audit_enabled": settings.AUDIT_LOG_ENABLED,
+        "backup_enabled": settings.BACKUP_ENABLED
     }
\ No newline at end of file
diff --git a/backend/app/core/middleware.py b/backend/app/core/middleware.py
index 175925f..579d94b 100644
--- a/backend/app/core/middleware.py
+++ b/backend/app/core/middleware.py
@@ -1,381 +1,318 @@
-import re
 import time
+import logging
 import json
-import uuid
-from datetime import datetime
-from typing import Dict, List, Optional, Any
+from typing import Callable, Dict, Any
+from fastapi import FastAPI, Request, Response
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.middleware.trustedhost import TrustedHostMiddleware
 from starlette.middleware.base import BaseHTTPMiddleware
-from starlette.requests import Request
-from starlette.responses import Response
-from fastapi import FastAPI
+from starlette.responses import JSONResponse
 
 from .config import settings
-from .monitoring import get_monitor
 
-# PII patterns for redaction
-PII_PATTERNS = [
-    # Email addresses
-    (r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '[EMAIL]'),
-    # Phone numbers (various formats)
-    (r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', '[PHONE]'),
-    (r'\b\(\d{3}\)\s*\d{3}[-.]?\d{4}\b', '[PHONE]'),
-    # Social Security Numbers
-    (r'\b\d{3}-\d{2}-\d{4}\b', '[SSN]'),
-    # Credit Card Numbers (basic pattern)
-    (r'\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b', '[CREDIT_CARD]'),
-    # IP Addresses
-    (r'\b(?:\d{1,3}\.){3}\d{1,3}\b', '[IP_ADDRESS]'),
-    # Basic name patterns (consecutive capitalized words)
-    (r'\b[A-Z][a-z]+ [A-Z][a-z]+\b', '[NAME]'),
-]
+logger = logging.getLogger(__name__)
 
 
-class PIIRedactionMiddleware(BaseHTTPMiddleware):
-    """Middleware to redact PII from request/response data"""
-    
-    def __init__(self, app: FastAPI, enabled: bool = True):
-        super().__init__(app)
-        self.enabled = enabled
-        self.monitor = get_monitor()
-    
-    async def dispatch(self, request: Request, call_next):
-        if not self.enabled:
-            return await call_next(request)
-        
-        # Store original body for redaction
-        body = b""
-        if request.method in ["POST", "PUT", "PATCH"]:
-            body = await request.body()
-        
-        # Create new request with redacted body if needed
-        if body:
-            redacted_body = self._redact_pii(body.decode('utf-8', errors='ignore'))
-            # Note: In production, you might want to log the redacted version
-            # but pass the original to the application
-        
-        # Process the request
-        response = await call_next(request)
-        
-        # Redact response if needed (for logging purposes)
-        if hasattr(response, 'body'):
-            # This is a simplified approach - in production you'd want more sophisticated handling
-            pass
-        
-        return response
-    
-    def _redact_pii(self, text: str) -> str:
-        """Redact PII from text using regex patterns"""
-        redacted_text = text
-        
-        for pattern, replacement in PII_PATTERNS:
-            redacted_text = re.sub(pattern, replacement, redacted_text, flags=re.IGNORECASE)
-        
-        return redacted_text
-
-
-class AuditLogMiddleware(BaseHTTPMiddleware):
-    """Middleware to log API requests for audit purposes"""
-    
-    def __init__(self, app: FastAPI, enabled: bool = True):
-        super().__init__(app)
-        self.enabled = enabled
-        self.monitor = get_monitor()
+class RequestLoggingMiddleware(BaseHTTPMiddleware):
+    """Middleware for logging all incoming requests"""
     
-    async def dispatch(self, request: Request, call_next):
-        if not self.enabled:
-            return await call_next(request)
-        
-        # Generate request ID for tracing
-        request_id = str(uuid.uuid4())
-        
-        # Extract request information
+    async def dispatch(self, request: Request, call_next: Callable) -> Response:
         start_time = time.time()
-        client_ip = self._get_client_ip(request)
-        user_agent = request.headers.get("user-agent", "")
-        method = request.method
-        url = str(request.url)
-        
-        # Extract user information if available
-        user_id = None
-        if hasattr(request.state, 'user'):
-            user_id = request.state.user.get('id')
         
-        # Log request start
-        self.monitor.log_info(
-            "audit_middleware",
-            f"API request started: {method} {url}",
-            {
-                "request_id": request_id,
-                "method": method,
-                "url": url,
-                "client_ip": client_ip,
-                "user_agent": user_agent,
-                "user_id": user_id
-            },
-            trace_id=request_id,
-            user_id=user_id
+        # Log request
+        logger.info(
+            f"Request: {request.method} {request.url.path} - "
+            f"Client: {request.client.host if request.client else 'unknown'}"
         )
         
         # Process request
         try:
             response = await call_next(request)
             
-            # Calculate response time
-            end_time = time.time()
-            response_time = end_time - start_time
+            # Calculate processing time
+            process_time = time.time() - start_time
             
-            # Log successful request
-            self.monitor.log_info(
-                "audit_middleware",
-                f"API request completed: {method} {url} - {response.status_code}",
-                {
-                    "request_id": request_id,
-                    "method": method,
-                    "url": url,
-                    "status_code": response.status_code,
-                    "response_time": response_time,
-                    "client_ip": client_ip,
-                    "user_id": user_id
-                },
-                trace_id=request_id,
-                user_id=user_id
+            # Log response
+            logger.info(
+                f"Response: {request.method} {request.url.path} - "
+                f"Status: {response.status_code} - "
+                f"Time: {process_time:.3f}s"
             )
             
-            # Add request ID to response headers
-            response.headers["X-Request-ID"] = request_id
+            # Add processing time header
+            response.headers["X-Process-Time"] = str(process_time)
             
             return response
             
         except Exception as e:
-            # Log failed request
-            end_time = time.time()
-            response_time = end_time - start_time
-            
-            self.monitor.log_error(
-                "audit_middleware",
-                f"API request failed: {method} {url}",
-                str(e),
-                trace_id=request_id,
-                user_id=user_id
+            # Log error
+            process_time = time.time() - start_time
+            logger.error(
+                f"Error: {request.method} {request.url.path} - "
+                f"Exception: {str(e)} - "
+                f"Time: {process_time:.3f}s"
             )
-            
             raise
+
+
+class SecurityHeadersMiddleware(BaseHTTPMiddleware):
+    """Middleware for adding security headers"""
     
-    def _get_client_ip(self, request: Request) -> str:
-        """Extract client IP address from request"""
-        # Check for forwarded headers first
-        forwarded_for = request.headers.get("x-forwarded-for")
-        if forwarded_for:
-            return forwarded_for.split(",")[0].strip()
+    async def dispatch(self, request: Request, call_next: Callable) -> Response:
+        response = await call_next(request)
         
-        real_ip = request.headers.get("x-real-ip")
-        if real_ip:
-            return real_ip
+        # Add security headers
+        response.headers["X-Content-Type-Options"] = "nosniff"
+        response.headers["X-Frame-Options"] = "DENY"
+        response.headers["X-XSS-Protection"] = "1; mode=block"
+        response.headers["Referrer-Policy"] = "strict-origin-when-cross-origin"
+        response.headers["Permissions-Policy"] = "geolocation=(), microphone=(), camera=()"
         
-        # Fall back to direct client IP
-        if hasattr(request.client, 'host'):
-            return request.client.host
+        # Add CSP header if not already present
+        if "Content-Security-Policy" not in response.headers:
+            response.headers["Content-Security-Policy"] = (
+                "default-src 'self'; "
+                "script-src 'self' 'unsafe-inline' 'unsafe-eval'; "
+                "style-src 'self' 'unsafe-inline'; "
+                "img-src 'self' data: https:; "
+                "font-src 'self' data:; "
+                "connect-src 'self' ws: wss:;"
+            )
         
-        return "unknown"
+        return response
 
 
-class RequestLoggingMiddleware(BaseHTTPMiddleware):
-    """Middleware for detailed request/response logging"""
+class RateLimitMiddleware(BaseHTTPMiddleware):
+    """Middleware for rate limiting"""
     
-    def __init__(self, app: FastAPI, enabled: bool = True, log_bodies: bool = False):
+    def __init__(self, app, redis_client=None):
         super().__init__(app)
-        self.enabled = enabled
-        self.log_bodies = log_bodies
-        self.monitor = get_monitor()
-        
-        # Endpoints to exclude from detailed logging (to avoid noise)
-        self.exclude_paths = [
-            "/health",
-            "/metrics",
-            "/docs",
-            "/openapi.json",
-            "/favicon.ico"
-        ]
+        self.redis_client = redis_client
     
-    async def dispatch(self, request: Request, call_next):
-        if not self.enabled:
-            return await call_next(request)
-        
-        # Skip logging for excluded paths
-        if any(request.url.path.startswith(path) for path in self.exclude_paths):
+    async def dispatch(self, request: Request, call_next: Callable) -> Response:
+        """Count requests per client IP in Redis; answer 429 past RATE_LIMIT_REQUESTS per RATE_LIMIT_WINDOW seconds"""
+        if not self.redis_client:
             return await call_next(request)
         
-        # Generate trace ID
-        trace_id = str(uuid.uuid4())
-        
-        # Extract request details
-        start_time = time.time()
-        method = request.method
-        url = str(request.url)
-        headers = dict(request.headers)
-        
-        # Remove sensitive headers
-        sensitive_headers = ["authorization", "cookie", "x-api-key"]
-        filtered_headers = {
-            k: v if k.lower() not in sensitive_headers else "[REDACTED]"
-            for k, v in headers.items()
-        }
-        
-        # Log request body if enabled
-        request_body = None
-        if self.log_bodies and method in ["POST", "PUT", "PATCH"]:
-            try:
-                body = await request.body()
-                request_body = body.decode('utf-8', errors='ignore')[:1000]  # Limit size
-            except Exception:
-                request_body = "[ERROR_READING_BODY]"
-        
-        # Log request details
-        self.monitor.log_info(
-            "request_logging",
-            f"Incoming request: {method} {url}",
-            {
-                "trace_id": trace_id,
-                "method": method,
-                "url": url,
-                "headers": filtered_headers,
-                "body": request_body if self.log_bodies else None,
-                "content_length": headers.get("content-length"),
-                "content_type": headers.get("content-type")
-            },
-            trace_id=trace_id
-        )
+        # Get client identifier (IP address)
+        client_ip = request.client.host if request.client else "unknown"
         
-        # Process request
+        # Check rate limit
         try:
-            response = await call_next(request)
-            
-            # Calculate metrics
-            end_time = time.time()
-            response_time = end_time - start_time
-            
-            # Log response details
-            self.monitor.log_info(
-                "request_logging",
-                f"Response: {method} {url} - {response.status_code}",
-                {
-                    "trace_id": trace_id,
-                    "status_code": response.status_code,
-                    "response_time": response_time,
-                    "response_headers": dict(response.headers) if hasattr(response, 'headers') else {}
-                },
-                trace_id=trace_id
-            )
-            
-            # Add trace ID to response
-            response.headers["X-Trace-ID"] = trace_id
-            
-            return response
+            key = f"rate_limit:{client_ip}"
+            current_requests = int(self.redis_client.incr(key))
             
+            # INCR is atomic and creates a missing key at 1, so the first hit of a window attaches the expiry.
+            # A key that expired between a separate GET and INCR can no longer come back without a TTL.
+            if current_requests == 1:
+                self.redis_client.expire(key, settings.RATE_LIMIT_WINDOW)
+            
+            if current_requests > settings.RATE_LIMIT_REQUESTS:
+                logger.warning(f"Rate limit exceeded for IP: {client_ip}")
+                return JSONResponse(
+                    status_code=429,
+                    content={
+                        "error": "Rate limit exceeded",
+                        "message": f"Too many requests. Limit: {settings.RATE_LIMIT_REQUESTS} per {settings.RATE_LIMIT_WINDOW} seconds"
+                    }
+                )
+                
         except Exception as e:
-            # Log error
-            end_time = time.time()
-            response_time = end_time - start_time
-            
-            self.monitor.log_error(
-                "request_logging",
-                f"Request error: {method} {url}",
-                str(e),
-                trace_id=trace_id
-            )
-            
-            raise
+            logger.error(f"Rate limit check failed: {e}")
+            # Continue without rate limiting if Redis fails
+        
+        return await call_next(request)
 
 
-class RateLimitMiddleware(BaseHTTPMiddleware):
-    """Simple rate limiting middleware"""
+class PIIRedactionMiddleware(BaseHTTPMiddleware):
+    """Middleware for redacting PII from logs"""
     
-    def __init__(self, app: FastAPI, enabled: bool = True, requests_per_minute: int = 60):
+    def __init__(self, app):
         super().__init__(app)
-        self.enabled = enabled
-        self.requests_per_minute = requests_per_minute
-        self.request_counts: Dict[str, List[float]] = {}
-        self.monitor = get_monitor()
+        self.pii_patterns = [
+            r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',  # Email; the TLD part is letters only
+            r'\b\d{3}-\d{2}-\d{4}\b',  # SSN
+            r'\b\d{4}-\d{4}-\d{4}-\d{4}\b',  # Credit card
+            r'\b\d{10,11}\b',  # Phone numbers
+        ]
     
-    async def dispatch(self, request: Request, call_next):
-        if not self.enabled:
+    def redact_pii(self, text: str) -> str:
+        """Return ``text`` with every match of each pattern in ``self.pii_patterns`` replaced by '[REDACTED]'"""
+        import re
+        for pii_regex in self.pii_patterns:
+            text = re.compile(pii_regex).sub('[REDACTED]', text)
+        return text
+    
+    async def dispatch(self, request: Request, call_next: Callable) -> Response:
+        """Redact PII from decodable POST/PUT/PATCH bodies, then pass the request on; other bodies are left untouched"""
+        if request.method in ["POST", "PUT", "PATCH"]:
+            try:
+                body = await request.body()
+                if body:
+                    request._body = self.redact_pii(body.decode()).encode()
+            except UnicodeDecodeError:
+                # Binary payloads such as file uploads are not text: expected, so not an error.
+                logger.debug("Skipping PII redaction for non-text request body")
+            except Exception as e:
+                logger.error(f"PII redaction failed: {e}")
+        
+        return await call_next(request)
+
+
+class AuditLogMiddleware(BaseHTTPMiddleware):
+    """Middleware for audit logging"""
+    
+    async def dispatch(self, request: Request, call_next: Callable) -> Response:
+        # Skip audit logging for health checks and static files
+        if request.url.path in ["/health", "/health/detailed", "/docs", "/redoc"]:
             return await call_next(request)
         
-        # Get client identifier (IP address)
-        client_ip = self._get_client_ip(request)
-        current_time = time.time()
+        # Get user info if authenticated
+        user_id = None
+        user_email = None
+        try:
+            # This would be set by authentication middleware
+            user_id = getattr(request.state, "user_id", None)
+            user_email = getattr(request.state, "user_email", None)
+        except Exception:
+            pass
         
-        # Clean old requests (older than 1 minute)
-        if client_ip in self.request_counts:
-            self.request_counts[client_ip] = [
-                req_time for req_time in self.request_counts[client_ip]
-                if current_time - req_time < 60
-            ]
-        else:
-            self.request_counts[client_ip] = []
+        # Log audit event
+        audit_event = {
+            "timestamp": time.time(),
+            "method": request.method,
+            "path": request.url.path,
+            "query_params": dict(request.query_params),
+            "client_ip": request.client.host if request.client else "unknown",
+            "user_agent": request.headers.get("user-agent", ""),
+            "user_id": user_id,
+            "user_email": user_email,
+        }
         
-        # Check rate limit
-        if len(self.request_counts[client_ip]) >= self.requests_per_minute:
-            self.monitor.log_warning(
-                "rate_limit",
-                f"Rate limit exceeded for IP: {client_ip}",
-                {
-                    "client_ip": client_ip,
-                    "requests_in_window": len(self.request_counts[client_ip]),
-                    "limit": self.requests_per_minute
-                }
+        logger.info(f"AUDIT: {json.dumps(audit_event)}")
+        
+        return await call_next(request)
+
+
+class ErrorHandlingMiddleware(BaseHTTPMiddleware):
+    """Catch any exception raised downstream, log it, and answer with a JSON 500"""
+    
+    async def dispatch(self, request: Request, call_next: Callable) -> Response:
+        try:
+            return await call_next(request)
+        except Exception as e:
+            # exc_info=True attaches the full traceback to the log record
+            logger.error(
+                f"Unhandled exception in {request.method} {request.url.path}: {str(e)}",
+                exc_info=True
             )
             
-            return Response(
-                content=json.dumps({"error": "Rate limit exceeded"}),
-                status_code=429,
-                headers={"Content-Type": "application/json"}
+            # Generic body: exception details stay in the log, not in the response
+            return JSONResponse(
+                status_code=500,
+                content={
+                    "error": "Internal server error",
+                    "message": "An unexpected error occurred",
+                    "path": request.url.path
+                }
             )
-        
-        # Add current request to count
-        self.request_counts[client_ip].append(current_time)
+
+
+class MetricsMiddleware(BaseHTTPMiddleware):
+    """Time each request and report method/path/status/duration to a collector"""
+    
+    def __init__(self, app, metrics_collector=None):
+        super().__init__(app)
+        self.metrics_collector = metrics_collector
+    
+    async def dispatch(self, request: Request, call_next: Callable) -> Response:
+        start_time = time.time()
         
         # Process request
-        return await call_next(request)
-    
-    def _get_client_ip(self, request: Request) -> str:
-        """Extract client IP address from request"""
-        forwarded_for = request.headers.get("x-forwarded-for")
-        if forwarded_for:
-            return forwarded_for.split(",")[0].strip()
+        response = await call_next(request)
         
-        real_ip = request.headers.get("x-real-ip")
-        if real_ip:
-            return real_ip
+        # Wall-clock seconds spent downstream of this middleware
+        process_time = time.time() - start_time
         
-        if hasattr(request.client, 'host'):
-            return request.client.host
+        # Best-effort: a failing collector is logged and never breaks the response
+        if self.metrics_collector:
+            try:
+                self.metrics_collector.record_request(
+                    method=request.method,
+                    path=request.url.path,
+                    status_code=response.status_code,
+                    duration=process_time
+                )
+            except Exception as e:
+                logger.error(f"Failed to record metrics: {e}")
         
-        return "unknown"
+        return response
 
 
-def setup_middleware(app: FastAPI):
-    """Setup all middleware for the application"""
-    
-    # Add rate limiting
-    if settings.ENABLE_MONITORING:
-        app.add_middleware(RateLimitMiddleware, enabled=True, requests_per_minute=120)
+def setup_middleware(app: FastAPI) -> None:
+    """Register CORS, trusted-host, custom, rate-limit and metrics middleware on app"""
     
-    # Add request logging
+    # CORS: origins, methods and headers all come from settings
     app.add_middleware(
-        RequestLoggingMiddleware, 
-        enabled=settings.ENABLE_MONITORING,
-        log_bodies=settings.DEBUG
+        CORSMiddleware,
+        allow_origins=settings.ALLOWED_ORIGINS,
+        allow_credentials=True,
+        allow_methods=settings.ALLOWED_METHODS,
+        allow_headers=settings.ALLOWED_HEADERS,
     )
     
-    # Add audit logging
+    # Add trusted host middleware
     app.add_middleware(
-        AuditLogMiddleware,
-        enabled=settings.AUDIT_ENABLED
+        TrustedHostMiddleware,
+        allowed_hosts=["*"]  # wildcard = any Host header; restrict per deployment
     )
     
-    # Add PII redaction
-    app.add_middleware(
-        PIIRedactionMiddleware,
-        enabled=True
-    )
\ No newline at end of file
+    # NOTE(review): Starlette runs the last-added middleware outermost - confirm order
+    app.add_middleware(ErrorHandlingMiddleware)
+    app.add_middleware(RequestLoggingMiddleware)
+    app.add_middleware(SecurityHeadersMiddleware)
+    app.add_middleware(PIIRedactionMiddleware)
+    app.add_middleware(AuditLogMiddleware)
+    
+    # Rate limiting needs Redis: ping once at startup, skip the middleware on failure
+    try:
+        import redis
+        redis_client = redis.Redis.from_url(settings.REDIS_URL, decode_responses=True)
+        redis_client.ping()  # Test connection
+        app.add_middleware(RateLimitMiddleware, redis_client=redis_client)
+        logger.info("Rate limiting middleware enabled")
+    except Exception as e:
+        logger.warning(f"Rate limiting middleware disabled: {e}")
+    
+    # Metrics are opt-in (settings.ENABLE_MONITORING); setup errors only disable them
+    if settings.ENABLE_MONITORING:
+        try:
+            from .monitoring import MetricsCollector
+            metrics_collector = MetricsCollector()
+            app.add_middleware(MetricsMiddleware, metrics_collector=metrics_collector)
+            logger.info("Metrics middleware enabled")
+        except Exception as e:
+            logger.warning(f"Metrics middleware disabled: {e}")
+    
+    logger.info("Middleware setup completed")
+
+
+def get_request_info(request: Request) -> Dict[str, Any]:
+    """Snapshot request fields for logging; header values are NOT redacted"""
+    return {
+        "method": request.method,
+        "url": str(request.url),
+        "path": request.url.path,
+        "query_params": dict(request.query_params),
+        "headers": dict(request.headers),  # verbatim, incl. auth/cookie headers
+        "client_ip": request.client.host if request.client else "unknown",
+        "user_agent": request.headers.get("user-agent", ""),
+    }
+
+
+def get_response_info(response: Response) -> Dict[str, Any]:
+    """Snapshot the response status code and a copy of its headers for logging"""
+    return {
+        "status_code": response.status_code,
+        "headers": dict(response.headers),
+    }
\ No newline at end of file
diff --git a/backend/app/core/monitoring.py b/backend/app/core/monitoring.py
index 5dbe989..1ee45fa 100644
--- a/backend/app/core/monitoring.py
+++ b/backend/app/core/monitoring.py
@@ -1,726 +1,448 @@
-import logging
 import time
+import logging
 import json
 import asyncio
+from typing import Dict, Any, Optional, List
 from datetime import datetime, timedelta
-from typing import Dict, List, Optional, Any
-from contextlib import asynccontextmanager, contextmanager
-from dataclasses import dataclass, asdict
-from enum import Enum
-import traceback
-import psutil
-import threading
 from collections import defaultdict, deque
+import threading
+import psutil
+import os
 
-from .config import get_settings, get_agent_config, get_workflow_config
-
-settings = get_settings()
-agent_config = get_agent_config()
-workflow_config = get_workflow_config()
-
-
-class LogLevel(Enum):
-    """Log levels"""
-    DEBUG = "DEBUG"
-    INFO = "INFO"
-    WARNING = "WARNING"
-    ERROR = "ERROR"
-    CRITICAL = "CRITICAL"
-
-
-class MetricType(Enum):
-    """Metric types"""
-    COUNTER = "counter"
-    GAUGE = "gauge"
-    HISTOGRAM = "histogram"
-    SUMMARY = "summary"
-
-
-@dataclass
-class Metric:
-    """Metric data structure"""
-    name: str
-    value: float
-    metric_type: MetricType
-    labels: Dict[str, str]
-    timestamp: datetime
-    description: str = ""
-
-
-@dataclass
-class LogEntry:
-    """Log entry data structure"""
-    timestamp: datetime
-    level: LogLevel
-    message: str
-    module: str
-    function: str
-    line_number: int
-    extra_data: Dict[str, Any]
-    trace_id: Optional[str] = None
-    user_id: Optional[str] = None
-
-
-@dataclass
-class AgentExecutionMetrics:
-    """Agent execution metrics"""
-    agent_type: str
-    execution_time: float
-    confidence: float
-    success: bool
-    error_message: Optional[str] = None
-    input_size: int
-    output_size: int
-    memory_usage: float
-    cpu_usage: float
-
+from .config import settings
 
-@dataclass
-class WorkflowMetrics:
-    """Workflow execution metrics"""
-    workflow_id: str
-    total_stages: int
-    completed_stages: int
-    failed_stages: int
-    total_execution_time: float
-    average_stage_time: float
-    memory_peak: float
-    cpu_peak: float
-    status: str
+logger = logging.getLogger(__name__)
 
 
 class MetricsCollector:
-    """Metrics collection and storage"""
+    """Collects and stores application metrics"""
     
-    def __init__(self):
-        self.metrics: List[Metric] = []
-        self.metrics_lock = threading.Lock()
-        self.max_metrics = 10000
+    def __init__(self, max_history: int = 1000):
+        self.max_history = max_history
+        self.request_metrics = deque(maxlen=max_history)  # newest samples only
+        self.error_metrics = deque(maxlen=max_history)
+        self.performance_metrics = deque(maxlen=max_history)
+        self.agent_metrics = deque(maxlen=max_history)
+        self.lock = threading.Lock()
+        
+        # Lifetime totals: unlike the bounded deques above, these never roll over
+        self.total_requests = 0
+        self.total_errors = 0
+        self.total_agent_executions = 0
+        
+        # Side effect: every instance starts its own daemon sampling thread
+        self._start_background_collection()
+    
+    def record_request(self, method: str, path: str, status_code: int, duration: float):
+        """Store one request sample; status codes >= 400 also count as errors"""
+        is_error = status_code >= 400
+        sample = {
+            "timestamp": time.time(),
+            "method": method,
+            "path": path,
+            "status_code": status_code,
+            "duration": duration,
+            "datetime": datetime.utcnow().isoformat()
+        }
+        # One lock covers both deques and both counters so they stay in step
+        with self.lock:
+            self.request_metrics.append(sample)
+            self.total_requests += 1
+            if is_error:
+                self.error_metrics.append(sample)
+                self.total_errors += 1
+    
+    def record_agent_execution(self, agent_name: str, duration: float, success: bool, 
+                              confidence: Optional[float] = None, error: Optional[str] = None):
+        """Append one agent-run sample and bump the lifetime execution counter"""
+        now = time.time()
+        entry = {
+            "timestamp": now,
+            "agent_name": agent_name,
+            "duration": duration,
+            "success": success,
+            "confidence": confidence,
+            "error": error,
+            "datetime": datetime.utcnow().isoformat()
+        }
+        with self.lock:
+            self.agent_metrics.append(entry)
+            self.total_agent_executions += 1
+    
+    def record_performance_metric(self, metric_name: str, value: float, tags: Optional[Dict[str, str]] = None):
+        """Append a named numeric sample (with optional tags) to performance_metrics"""
+        metric = {
+            "timestamp": time.time(),
+            "metric_name": metric_name,
+            "value": value,
+            "tags": tags or {},  # None becomes an empty dict
+            "datetime": datetime.utcnow().isoformat()
+        }
         
-    def add_metric(self, metric: Metric):
-        """Add a metric to the collection"""
-        with self.metrics_lock:
-            self.metrics.append(metric)
-            if len(self.metrics) > self.max_metrics:
-                # Remove oldest metrics
-                self.metrics = self.metrics[-self.max_metrics:]
+        with self.lock:
+            self.performance_metrics.append(metric)
     
-    def get_metrics(self, metric_name: Optional[str] = None, 
-                   start_time: Optional[datetime] = None,
-                   end_time: Optional[datetime] = None) -> List[Metric]:
-        """Get metrics with optional filtering"""
-        with self.metrics_lock:
-            filtered_metrics = self.metrics
+    def get_request_stats(self, window_minutes: int = 60) -> Dict[str, Any]:
+        """Summarise the last ``window_minutes`` of requests; same keys when empty"""
+        cutoff_time = time.time() - (window_minutes * 60)
+        
+        with self.lock:
+            recent_requests = [
+                req for req in self.request_metrics
+                if req["timestamp"] >= cutoff_time
+            ]
             
-            if metric_name:
-                filtered_metrics = [m for m in filtered_metrics if m.name == metric_name]
+            if not recent_requests:
+                return {
+                    "total_requests": 0,
+                    "avg_response_time": 0,
+                    "error_rate": 0,
+                    "status_codes": {},
+                    "endpoints": {},
+                    "window_minutes": window_minutes
+                }
             
-            if start_time:
-                filtered_metrics = [m for m in filtered_metrics if m.timestamp >= start_time]
+            total_requests = len(recent_requests)  # > 0: empty case returned above
+            avg_response_time = sum(req["duration"] for req in recent_requests) / total_requests
+            error_count = len([req for req in recent_requests if req["status_code"] >= 400])
+            error_rate = (error_count / total_requests) * 100
             
-            if end_time:
-                filtered_metrics = [m for m in filtered_metrics if m.timestamp <= end_time]
+            # Status code distribution
+            status_codes = defaultdict(int)
+            for req in recent_requests:
+                status_codes[req["status_code"]] += 1
             
-            return filtered_metrics
+            # Endpoint distribution
+            endpoints = defaultdict(int)
+            for req in recent_requests:
+                endpoints[req["path"]] += 1
+            
+            return {
+                "total_requests": total_requests,
+                "avg_response_time": avg_response_time,
+                "error_rate": error_rate,
+                "status_codes": dict(status_codes),
+                "endpoints": dict(endpoints),
+                "window_minutes": window_minutes
+            }
     
-    def get_metric_summary(self, metric_name: str, 
-                          time_window: timedelta = timedelta(hours=1)) -> Dict[str, Any]:
-        """Get summary statistics for a metric"""
-        end_time = datetime.utcnow()
-        start_time = end_time - time_window
-        
-        metrics = self.get_metrics(metric_name, start_time, end_time)
-        
-        if not metrics:
+    def get_agent_stats(self, window_minutes: int = 60) -> Dict[str, Any]:
+        """Summarise agent runs in the last ``window_minutes``; same keys when empty"""
+        cutoff_time = time.time() - (window_minutes * 60)
+        
+        with self.lock:
+            recent_executions = [
+                exec_ for exec_ in self.agent_metrics
+                if exec_["timestamp"] >= cutoff_time
+            ]
+            
+            if not recent_executions:
+                return {
+                    "total_executions": 0,
+                    "success_rate": 0,
+                    "avg_duration": 0,
+                    "agents": {},
+                    "window_minutes": window_minutes
+                }
+            
+            total_executions = len(recent_executions)  # > 0: empty case returned above
+            successful_executions = len([exec_ for exec_ in recent_executions if exec_["success"]])
+            success_rate = (successful_executions / total_executions) * 100
+            avg_duration = sum(exec_["duration"] for exec_ in recent_executions) / total_executions
+            
+            # Per-agent tallies; the lambda gives each new agent its own fresh dict
+            agents = defaultdict(lambda: {"executions": 0, "successes": 0, "total_duration": 0})
+            for exec_ in recent_executions:
+                agent_name = exec_["agent_name"]
+                agents[agent_name]["executions"] += 1
+                agents[agent_name]["total_duration"] += exec_["duration"]
+                if exec_["success"]:
+                    agents[agent_name]["successes"] += 1
+            
+            # Every agent listed has executions >= 1, so these divisions are safe
+            for agent_name, stats in agents.items():
+                stats["success_rate"] = (stats["successes"] / stats["executions"]) * 100
+                stats["avg_duration"] = stats["total_duration"] / stats["executions"]
+            
             return {
-                "count": 0,
-                "min": 0,
-                "max": 0,
-                "avg": 0,
-                "sum": 0
+                "total_executions": total_executions,
+                "success_rate": success_rate,
+                "avg_duration": avg_duration,
+                "agents": dict(agents),
+                "window_minutes": window_minutes
             }
-        
-        values = [m.value for m in metrics]
+    
+    def get_system_stats(self) -> Dict[str, Any]:
+        """Sample CPU, memory and '/' disk usage; on failure return an error dict"""
+        try:
+            cpu_percent = psutil.cpu_percent(interval=1)  # blocks ~1s while sampling
+            memory = psutil.virtual_memory()
+            disk = psutil.disk_usage('/')
+            
+            return {
+                "cpu_percent": cpu_percent,
+                "memory_percent": memory.percent,
+                "memory_available": memory.available,
+                "memory_total": memory.total,
+                "disk_percent": disk.percent,
+                "disk_free": disk.free,
+                "disk_total": disk.total,
+                "timestamp": time.time(),
+                "datetime": datetime.utcnow().isoformat()
+            }
+        except Exception as e:
+            logger.error(f"Failed to get system stats: {e}")
+            return {
+                "error": str(e),
+                "timestamp": time.time(),
+                "datetime": datetime.utcnow().isoformat()
+            }
+    
+    def get_all_metrics(self) -> Dict[str, Any]:
+        """Bundle request, agent and system stats with the lifetime counters"""
         return {
-            "count": len(values),
-            "min": min(values),
-            "max": max(values),
-            "avg": sum(values) / len(values),
-            "sum": sum(values)
+            "request_stats": self.get_request_stats(),
+            "agent_stats": self.get_agent_stats(),
+            "system_stats": self.get_system_stats(),
+            "total_requests": self.total_requests,
+            "total_errors": self.total_errors,
+            "total_agent_executions": self.total_agent_executions,
+            "timestamp": time.time(),
+            "datetime": datetime.utcnow().isoformat()
         }
+    
+    def _start_background_collection(self):
+        """Launch a daemon thread that records CPU/memory/disk gauges every minute"""
+        gauges = ("cpu_percent", "memory_percent", "disk_percent")
+        
+        def sampler_loop():
+            while True:
+                try:
+                    snapshot = self.get_system_stats()
+                    if "error" not in snapshot:
+                        for gauge in gauges:
+                            self.record_performance_metric(gauge, snapshot[gauge])
+                    time.sleep(60)  # Collect every minute
+                except Exception as e:
+                    logger.error(f"Background metrics collection failed: {e}")
+                    time.sleep(60)
+        
+        worker = threading.Thread(target=sampler_loop, daemon=True)
+        worker.start()
 
 
 class LogCollector:
-    """Log collection and storage"""
+    """Collects and manages application logs"""
     
-    def __init__(self):
-        self.logs: List[LogEntry] = []
-        self.logs_lock = threading.Lock()
-        self.max_logs = 10000
+    def __init__(self, max_logs: int = 10000):
+        self.max_logs = max_logs
+        self.logs = deque(maxlen=max_logs)  # bounded: oldest entries drop off first
+        self.lock = threading.Lock()  # taken by add_log and get_logs
+    
+    def add_log(self, level: str, message: str, context: Optional[Dict[str, Any]] = None):
+        """Append a timestamped entry (level, message, optional context) to self.logs"""
+        log_entry = {
+            "timestamp": time.time(),
+            "datetime": datetime.utcnow().isoformat(),
+            "level": level,
+            "message": message,
+            "context": context or {}
+        }
         
-    def add_log(self, log_entry: LogEntry):
-        """Add a log entry to the collection"""
-        with self.logs_lock:
+        with self.lock:
             self.logs.append(log_entry)
-            if len(self.logs) > self.max_logs:
-                # Remove oldest logs
-                self.logs = self.logs[-self.max_logs:]
     
-    def get_logs(self, level: Optional[LogLevel] = None,
-                module: Optional[str] = None,
-                start_time: Optional[datetime] = None,
-                end_time: Optional[datetime] = None,
-                trace_id: Optional[str] = None) -> List[LogEntry]:
-        """Get logs with optional filtering"""
-        with self.logs_lock:
-            filtered_logs = self.logs
-            
+    def get_logs(self, level: Optional[str] = None, limit: int = 100) -> List[Dict[str, Any]]:
+        """Newest ``limit`` logs, optionally for one level; limit <= 0 yields []"""
+        with self.lock:
             if level:
-                filtered_logs = [l for l in filtered_logs if l.level == level]
-            
-            if module:
-                filtered_logs = [l for l in filtered_logs if l.module == module]
-            
-            if start_time:
-                filtered_logs = [l for l in filtered_logs if l.timestamp >= start_time]
-            
-            if end_time:
-                filtered_logs = [l for l in filtered_logs if l.timestamp <= end_time]
-            
-            if trace_id:
-                filtered_logs = [l for l in filtered_logs if l.trace_id == trace_id]
+                filtered_logs = [log for log in self.logs if log["level"] == level]
+            else:
+                filtered_logs = list(self.logs)
             
-            return filtered_logs
+            return filtered_logs[-limit:] if limit > 0 else []  # [-0:] would be all
+    
+    def get_error_logs(self, limit: int = 100) -> List[Dict[str, Any]]:
+        """Shortcut for get_logs(level="ERROR", limit=limit)"""
+        return self.get_logs(level="ERROR", limit=limit)
 
 
 class PerformanceMonitor:
-    """System performance monitoring"""
+    """Monitors application performance"""
     
     def __init__(self):
-        self.start_time = time.time()
         self.metrics_collector = MetricsCollector()
         self.log_collector = LogCollector()
-        self.agent_metrics: Dict[str, List[AgentExecutionMetrics]] = defaultdict(list)
-        self.workflow_metrics: Dict[str, WorkflowMetrics] = {}
-        self.system_metrics = deque(maxlen=1000)
-        
-        # Start system monitoring
-        if settings.ENABLE_MONITORING:
-            self._start_system_monitoring()
-    
-    def _start_system_monitoring(self):
-        """Start system monitoring thread"""
-        def monitor_system():
-            while True:
-                try:
-                    # CPU usage
-                    cpu_percent = psutil.cpu_percent(interval=1)
-                    self.metrics_collector.add_metric(Metric(
-                        name="system_cpu_usage",
-                        value=cpu_percent,
-                        metric_type=MetricType.GAUGE,
-                        labels={"component": "system"},
-                        timestamp=datetime.utcnow(),
-                        description="System CPU usage percentage"
-                    ))
-                    
-                    # Memory usage
-                    memory = psutil.virtual_memory()
-                    self.metrics_collector.add_metric(Metric(
-                        name="system_memory_usage",
-                        value=memory.percent,
-                        metric_type=MetricType.GAUGE,
-                        labels={"component": "system"},
-                        timestamp=datetime.utcnow(),
-                        description="System memory usage percentage"
-                    ))
-                    
-                    # Disk usage
-                    disk = psutil.disk_usage('/')
-                    self.metrics_collector.add_metric(Metric(
-                        name="system_disk_usage",
-                        value=(disk.used / disk.total) * 100,
-                        metric_type=MetricType.GAUGE,
-                        labels={"component": "system"},
-                        timestamp=datetime.utcnow(),
-                        description="System disk usage percentage"
-                    ))
-                    
-                    # Store system metrics
-                    self.system_metrics.append({
-                        "timestamp": datetime.utcnow(),
-                        "cpu_percent": cpu_percent,
-                        "memory_percent": memory.percent,
-                        "disk_percent": (disk.used / disk.total) * 100
-                    })
-                    
-                    time.sleep(workflow_config.MONITORING_INTERVAL)
-                    
-                except Exception as e:
-                    self.log_error("system_monitor", "Failed to collect system metrics", str(e))
-                    time.sleep(5)
-        
-        thread = threading.Thread(target=monitor_system, daemon=True)
-        thread.start()
-    
-    def log_info(self, module: str, message: str, extra_data: Dict[str, Any] = None,
-                trace_id: Optional[str] = None, user_id: Optional[str] = None):
-        """Log info message"""
-        self._log(LogLevel.INFO, module, message, extra_data or {}, trace_id, user_id)
-    
-    def log_warning(self, module: str, message: str, extra_data: Dict[str, Any] = None,
-                   trace_id: Optional[str] = None, user_id: Optional[str] = None):
-        """Log warning message"""
-        self._log(LogLevel.WARNING, module, message, extra_data or {}, trace_id, user_id)
-    
-    def log_error(self, module: str, message: str, error_details: str = None,
-                 trace_id: Optional[str] = None, user_id: Optional[str] = None):
-        """Log error message"""
-        extra_data = {"error_details": error_details} if error_details else {}
-        self._log(LogLevel.ERROR, module, message, extra_data, trace_id, user_id)
+        self.start_time = time.time()
     
-    def log_critical(self, module: str, message: str, error_details: str = None,
-                    trace_id: Optional[str] = None, user_id: Optional[str] = None):
-        """Log critical message"""
-        extra_data = {"error_details": error_details} if error_details else {}
-        self._log(LogLevel.CRITICAL, module, message, extra_data, trace_id, user_id)
+    def record_request(self, method: str, path: str, status_code: int, duration: float):
+        """Forward one request sample to self.metrics_collector unchanged"""
+        self.metrics_collector.record_request(method, path, status_code, duration)
     
-    def _log(self, level: LogLevel, module: str, message: str, extra_data: Dict[str, Any],
-             trace_id: Optional[str] = None, user_id: Optional[str] = None):
-        """Internal logging method"""
-        # Get caller information
-        frame = traceback.extract_stack()[-2]
-        
-        log_entry = LogEntry(
-            timestamp=datetime.utcnow(),
-            level=level,
-            message=message,
-            module=module,
-            function=frame.name,
-            line_number=frame.lineno,
-            extra_data=extra_data,
-            trace_id=trace_id,
-            user_id=user_id
-        )
-        
-        self.log_collector.add_log(log_entry)
-        
-        # Also log to standard logging
-        logger = logging.getLogger(module)
-        log_message = f"{message} | {json.dumps(extra_data)}" if extra_data else message
-        getattr(logger, level.value.lower())(log_message)
+    def record_agent_execution(self, agent_name: str, duration: float, success: bool, 
+                              confidence: Optional[float] = None, error: Optional[str] = None):
+        """Forward one agent-run sample to self.metrics_collector unchanged"""
+        self.metrics_collector.record_agent_execution(agent_name, duration, success, confidence, error)
     
-    @contextmanager
-    def monitor_agent_execution(self, agent_type: str, trace_id: Optional[str] = None):
-        """Context manager for monitoring agent execution"""
-        start_time = time.time()
-        start_memory = psutil.Process().memory_info().rss / 1024 / 1024  # MB
-        start_cpu = psutil.Process().cpu_percent()
-        
-        try:
-            yield
-            success = True
-            error_message = None
-        except Exception as e:
-            success = False
-            error_message = str(e)
-            raise
-        finally:
-            end_time = time.time()
-            end_memory = psutil.Process().memory_info().rss / 1024 / 1024  # MB
-            end_cpu = psutil.Process().cpu_percent()
-            
-            execution_time = end_time - start_time
-            memory_usage = end_memory - start_memory
-            cpu_usage = end_cpu - start_cpu
-            
-            # Record agent metrics
-            agent_metric = AgentExecutionMetrics(
-                agent_type=agent_type,
-                execution_time=execution_time,
-                confidence=0.0,  # Will be set by agent
-                success=success,
-                error_message=error_message,
-                input_size=0,  # Will be set by agent
-                output_size=0,  # Will be set by agent
-                memory_usage=memory_usage,
-                cpu_usage=cpu_usage
-            )
-            
-            self.agent_metrics[agent_type].append(agent_metric)
-            
-            # Add metrics
-            self.metrics_collector.add_metric(Metric(
-                name="agent_execution_time",
-                value=execution_time,
-                metric_type=MetricType.HISTOGRAM,
-                labels={"agent_type": agent_type, "success": str(success)},
-                timestamp=datetime.utcnow(),
-                description=f"Execution time for {agent_type} agent"
-            ))
-            
-            self.metrics_collector.add_metric(Metric(
-                name="agent_memory_usage",
-                value=memory_usage,
-                metric_type=MetricType.HISTOGRAM,
-                labels={"agent_type": agent_type},
-                timestamp=datetime.utcnow(),
-                description=f"Memory usage for {agent_type} agent"
-            ))
-            
-            # Log execution
-            if success:
-                self.log_info(
-                    "agent_execution",
-                    f"Agent {agent_type} executed successfully",
-                    {
-                        "execution_time": execution_time,
-                        "memory_usage": memory_usage,
-                        "cpu_usage": cpu_usage
-                    },
-                    trace_id
-                )
-            else:
-                self.log_error(
-                    "agent_execution",
-                    f"Agent {agent_type} execution failed",
-                    error_message,
-                    trace_id
-                )
+    def log_info(self, component: str, message: str, context: Optional[Dict[str, Any]] = None):
+        """Store an INFO entry with context; echo the message (no context) to logger"""
+        self.log_collector.add_log("INFO", f"[{component}] {message}", context)
+        logger.info(f"[{component}] {message}")
     
-    @asynccontextmanager
-    async def monitor_workflow_execution(self, workflow_id: str, total_stages: int,
-                                       trace_id: Optional[str] = None):
-        """Context manager for monitoring workflow execution"""
-        start_time = time.time()
-        start_memory = psutil.Process().memory_info().rss / 1024 / 1024  # MB
-        start_cpu = psutil.Process().cpu_percent()
-        
-        workflow_metric = WorkflowMetrics(
-            workflow_id=workflow_id,
-            total_stages=total_stages,
-            completed_stages=0,
-            failed_stages=0,
-            total_execution_time=0.0,
-            average_stage_time=0.0,
-            memory_peak=start_memory,
-            cpu_peak=start_cpu,
-            status="running"
-        )
-        
-        self.workflow_metrics[workflow_id] = workflow_metric
-        
-        try:
-            yield workflow_metric
-            workflow_metric.status = "completed"
-        except Exception as e:
-            workflow_metric.status = "failed"
-            raise
-        finally:
-            end_time = time.time()
-            end_memory = psutil.Process().memory_info().rss / 1024 / 1024  # MB
-            end_cpu = psutil.Process().cpu_percent()
-            
-            workflow_metric.total_execution_time = end_time - start_time
-            workflow_metric.memory_peak = max(workflow_metric.memory_peak, end_memory)
-            workflow_metric.cpu_peak = max(workflow_metric.cpu_peak, end_cpu)
-            
-            if workflow_metric.completed_stages > 0:
-                workflow_metric.average_stage_time = workflow_metric.total_execution_time / workflow_metric.completed_stages
-            
-            # Add workflow metrics
-            self.metrics_collector.add_metric(Metric(
-                name="workflow_execution_time",
-                value=workflow_metric.total_execution_time,
-                metric_type=MetricType.HISTOGRAM,
-                labels={"workflow_id": workflow_id, "status": workflow_metric.status},
-                timestamp=datetime.utcnow(),
-                description=f"Execution time for workflow {workflow_id}"
-            ))
-            
-            self.metrics_collector.add_metric(Metric(
-                name="workflow_stages_completed",
-                value=workflow_metric.completed_stages,
-                metric_type=MetricType.COUNTER,
-                labels={"workflow_id": workflow_id},
-                timestamp=datetime.utcnow(),
-                description=f"Completed stages for workflow {workflow_id}"
-            ))
-            
-            # Log workflow completion
-            self.log_info(
-                "workflow_execution",
-                f"Workflow {workflow_id} {workflow_metric.status}",
-                {
-                    "total_stages": total_stages,
-                    "completed_stages": workflow_metric.completed_stages,
-                    "failed_stages": workflow_metric.failed_stages,
-                    "total_execution_time": workflow_metric.total_execution_time,
-                    "memory_peak": workflow_metric.memory_peak,
-                    "cpu_peak": workflow_metric.cpu_peak
-                },
-                trace_id
-            )
+    def log_warning(self, component: str, message: str, context: Optional[Dict[str, Any]] = None):
+        """Store a WARNING entry prefixed with ``[component]`` in the log collector and echo it to the module logger"""
+        self.log_collector.add_log("WARNING", f"[{component}] {message}", context)
+        logger.warning(f"[{component}] {message}")
     
-    def get_system_status(self) -> Dict[str, Any]:
-        """Get current system status"""
-        if not self.system_metrics:
-            return {
-                "status": "unknown",
-                "cpu_usage": 0,
-                "memory_usage": 0,
-                "disk_usage": 0,
-                "uptime": time.time() - self.start_time
-            }
-        
-        latest = self.system_metrics[-1]
-        return {
-            "status": "healthy" if latest["cpu_percent"] < 80 and latest["memory_percent"] < 80 else "warning",
-            "cpu_usage": latest["cpu_percent"],
-            "memory_usage": latest["memory_percent"],
-            "disk_usage": latest["disk_percent"],
-            "uptime": time.time() - self.start_time
-        }
+    def log_error(self, component: str, message: str, error: Optional[str] = None, 
+                  context: Optional[Dict[str, Any]] = None):
+        """Log an ERROR entry to the collector and module logger, appending ' - Error: <error>' when ``error`` is truthy"""
+        full_message = f"[{component}] {message}"
+        if error:
+            full_message += f" - Error: {error}"
+        
+        self.log_collector.add_log("ERROR", full_message, context)
+        logger.error(full_message)
     
-    def get_agent_performance_summary(self, agent_type: Optional[str] = None,
-                                    time_window: timedelta = timedelta(hours=1)) -> Dict[str, Any]:
-        """Get agent performance summary"""
-        end_time = datetime.utcnow()
-        start_time = end_time - time_window
-        
-        if agent_type:
-            metrics = [m for m in self.agent_metrics[agent_type] 
-                      if m.execution_time >= start_time.timestamp()]
-        else:
-            all_metrics = []
-            for agent_metrics in self.agent_metrics.values():
-                all_metrics.extend([m for m in agent_metrics 
-                                  if m.execution_time >= start_time.timestamp()])
-            metrics = all_metrics
-        
-        if not metrics:
-            return {
-                "total_executions": 0,
-                "success_rate": 0,
-                "average_execution_time": 0,
-                "average_memory_usage": 0,
-                "average_cpu_usage": 0
-            }
-        
-        total_executions = len(metrics)
-        successful_executions = len([m for m in metrics if m.success])
-        success_rate = successful_executions / total_executions if total_executions > 0 else 0
-        
-        return {
-            "total_executions": total_executions,
-            "success_rate": success_rate,
-            "average_execution_time": sum(m.execution_time for m in metrics) / total_executions,
-            "average_memory_usage": sum(m.memory_usage for m in metrics) / total_executions,
-            "average_cpu_usage": sum(m.cpu_usage for m in metrics) / total_executions
-        }
+    def get_metrics(self) -> Dict[str, Any]:
+        """Return whatever ``metrics_collector.get_all_metrics()`` currently reports"""
+        return self.metrics_collector.get_all_metrics()
     
-    def get_workflow_performance_summary(self, time_window: timedelta = timedelta(hours=1)) -> Dict[str, Any]:
-        """Get workflow performance summary"""
-        end_time = datetime.utcnow()
-        start_time = end_time - time_window
-        
-        workflows = [w for w in self.workflow_metrics.values() 
-                    if w.total_execution_time >= start_time.timestamp()]
-        
-        if not workflows:
-            return {
-                "total_workflows": 0,
-                "completed_workflows": 0,
-                "failed_workflows": 0,
-                "average_execution_time": 0,
-                "average_stages_completed": 0
-            }
-        
-        total_workflows = len(workflows)
-        completed_workflows = len([w for w in workflows if w.status == "completed"])
-        failed_workflows = len([w for w in workflows if w.status == "failed"])
-        
-        return {
-            "total_workflows": total_workflows,
-            "completed_workflows": completed_workflows,
-            "failed_workflows": failed_workflows,
-            "average_execution_time": sum(w.total_execution_time for w in workflows) / total_workflows,
-            "average_stages_completed": sum(w.completed_stages for w in workflows) / total_workflows
-        }
+    def get_logs(self, level: Optional[str] = None, limit: int = 100) -> List[Dict[str, Any]]:
+        """Return collected log entries by delegating to ``log_collector.get_logs(level, limit)``"""
+        return self.log_collector.get_logs(level, limit)
     
-    def get_metrics_summary(self, time_window: timedelta = timedelta(hours=1)) -> Dict[str, Any]:
-        """Get comprehensive metrics summary"""
-        return {
-            "system": self.get_system_status(),
-            "agents": self.get_agent_performance_summary(time_window=time_window),
-            "workflows": self.get_workflow_performance_summary(time_window=time_window),
-            "metrics_count": len(self.metrics_collector.metrics),
-            "logs_count": len(self.log_collector.logs)
-        }
+    def get_uptime(self) -> float:
+        """Return the seconds elapsed since ``self.start_time``, measured with the wall clock (``time.time()``)"""
+        return time.time() - self.start_time
 
 
 # Global monitoring instance
-monitor = PerformanceMonitor()
+_monitor = None
 
 
 def get_monitor() -> PerformanceMonitor:
     """Get the global monitoring instance"""
-    return monitor
+    global _monitor
+    if _monitor is None:
+        _monitor = PerformanceMonitor()
+    return _monitor
 
 
-def setup_logging():
-    """Setup logging configuration"""
-    logging.basicConfig(
-        level=getattr(logging, settings.LOG_LEVEL),
-        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-        handlers=[
-            logging.StreamHandler(),
-            logging.FileHandler('app.log')
-        ]
-    )
-    
-    # Set specific logger levels
-    logging.getLogger('uvicorn').setLevel(logging.INFO)
-    logging.getLogger('fastapi').setLevel(logging.INFO)
-    
-    monitor.log_info("monitoring", "Logging system initialized")
+def setup_monitoring() -> None:
+    """Create the global PerformanceMonitor if it does not exist yet; later calls do nothing"""
+    global _monitor
+    if _monitor is None:
+        _monitor = PerformanceMonitor()
+        logger.info("Monitoring system initialized")
 
 
-def log_agent_execution(agent_type: str, execution_time: float, confidence: float,
-                       success: bool, input_size: int, output_size: int,
-                       error_message: Optional[str] = None, trace_id: Optional[str] = None):
-    """Log agent execution metrics"""
-    if monitor.agent_metrics[agent_type]:
-        # Update the latest agent metric with additional data
-        latest_metric = monitor.agent_metrics[agent_type][-1]
-        latest_metric.confidence = confidence
-        latest_metric.input_size = input_size
-        latest_metric.output_size = output_size
-        
-        # Add confidence metric
-        monitor.metrics_collector.add_metric(Metric(
-            name="agent_confidence",
-            value=confidence,
-            metric_type=MetricType.GAUGE,
-            labels={"agent_type": agent_type, "success": str(success)},
-            timestamp=datetime.utcnow(),
-            description=f"Confidence score for {agent_type} agent"
-        ))
-        
-        # Add throughput metric
-        if execution_time > 0:
-            throughput = output_size / execution_time
-            monitor.metrics_collector.add_metric(Metric(
-                name="agent_throughput",
-                value=throughput,
-                metric_type=MetricType.GAUGE,
-                labels={"agent_type": agent_type},
-                timestamp=datetime.utcnow(),
-                description=f"Throughput for {agent_type} agent (output_size/time)"
-            ))
+def instrument_fastapi(app) -> None:
+    """Add middleware that reports every request, including ones whose handler raises, to the monitor"""
+    from fastapi import Request
+    from starlette.middleware.base import BaseHTTPMiddleware
+    
+    class MonitoringMiddleware(BaseHTTPMiddleware):
+        async def dispatch(self, request: Request, call_next):
+            start_time = time.time()
+            
+            def record(status_code: int) -> None:
+                # Single place that forwards the outcome of this request to the monitor
+                get_monitor().record_request(
+                    method=request.method,
+                    path=request.url.path,
+                    status_code=status_code,
+                    duration=time.time() - start_time
+                )
+            
+            try:
+                response = await call_next(request)
+            except Exception:
+                # An unhandled error reaches the client as a 500; count it before re-raising
+                record(500)
+                raise
+            record(response.status_code)
+            return response
+    
+    # Add monitoring middleware
+    app.add_middleware(MonitoringMiddleware)
+    logger.info("FastAPI application instrumented for monitoring")
 
 
-def log_workflow_stage_completion(workflow_id: str, stage_name: str, success: bool,
-                                execution_time: float, trace_id: Optional[str] = None):
-    """Log workflow stage completion"""
-    if workflow_id in monitor.workflow_metrics:
-        workflow = monitor.workflow_metrics[workflow_id]
+def create_health_check() -> Dict[str, Any]:
+    """Create a comprehensive health check response"""
+    monitor = get_monitor()
+    
+    try:
+        system_stats = monitor.get_metrics()["system_stats"]
         
-        if success:
-            workflow.completed_stages += 1
-        else:
-            workflow.failed_stages += 1
+        # Determine overall health
+        health_status = "healthy"
+        issues = []
         
-        # Add stage completion metric
-        monitor.metrics_collector.add_metric(Metric(
-            name="workflow_stage_completion",
-            value=1 if success else 0,
-            metric_type=MetricType.COUNTER,
-            labels={"workflow_id": workflow_id, "stage_name": stage_name, "success": str(success)},
-            timestamp=datetime.utcnow(),
-            description=f"Stage completion for {stage_name} in workflow {workflow_id}"
-        ))
+        # Check CPU usage
+        if system_stats.get("cpu_percent", 0) > 90:
+            health_status = "degraded"
+            issues.append("High CPU usage")
         
-        monitor.log_info(
-            "workflow_stage",
-            f"Workflow {workflow_id} stage {stage_name} {'completed' if success else 'failed'}",
-            {
-                "execution_time": execution_time,
-                "completed_stages": workflow.completed_stages,
-                "failed_stages": workflow.failed_stages
-            },
-            trace_id
-        )
-
-
-def setup_monitoring():
-    """Setup monitoring and observability"""
-    try:
-        # Setup logging
-        setup_logging()
+        # Check memory usage
+        if system_stats.get("memory_percent", 0) > 90:
+            health_status = "degraded"
+            issues.append("High memory usage")
         
-        # Initialize monitoring
-        monitor.log_info("monitoring", "Monitoring system initialized")
+        # Check disk usage
+        if system_stats.get("disk_percent", 0) > 90:
+            health_status = "degraded"
+            issues.append("High disk usage")
         
-        # Setup Prometheus metrics endpoint if enabled
-        if settings.ENABLE_MONITORING:
-            monitor.log_info("monitoring", "Metrics collection enabled")
+        # Check error rate
+        request_stats = monitor.get_metrics()["request_stats"]
+        if request_stats.get("error_rate", 0) > 10:
+            health_status = "degraded"
+            issues.append("High error rate")
         
-        print("✅ Monitoring setup complete")
+        return {
+            "status": health_status,
+            "timestamp": datetime.utcnow().isoformat(),
+            "uptime_seconds": monitor.get_uptime(),
+            "system_stats": system_stats,
+            "request_stats": request_stats,
+            "issues": issues
+        }
         
     except Exception as e:
-        print(f"⚠️ Monitoring setup failed: {e}")
+        logger.error(f"Health check failed: {e}")
+        return {
+            "status": "unhealthy",
+            "timestamp": datetime.utcnow().isoformat(),
+            "error": str(e)
+        }
 
 
-def instrument_fastapi(app):
-    """Instrument FastAPI application with monitoring"""
-    try:
-        from fastapi import Request, Response
-        import time
-        
-        @app.middleware("http")
-        async def monitoring_middleware(request: Request, call_next):
-            start_time = time.time()
-            
-            # Process request
-            response = await call_next(request)
-            
-            # Calculate metrics
-            process_time = time.time() - start_time
-            
-            # Record API metrics
-            monitor.metrics_collector.add_metric(Metric(
-                name="http_request_duration",
-                value=process_time,
-                metric_type=MetricType.HISTOGRAM,
-                labels={
-                    "method": request.method,
-                    "endpoint": str(request.url.path),
-                    "status_code": str(response.status_code)
-                },
-                timestamp=datetime.utcnow(),
-                description="HTTP request duration"
-            ))
-            
-            monitor.metrics_collector.add_metric(Metric(
-                name="http_requests_total",
-                value=1,
-                metric_type=MetricType.COUNTER,
-                labels={
-                    "method": request.method,
-                    "endpoint": str(request.url.path),
-                    "status_code": str(response.status_code)
-                },
-                timestamp=datetime.utcnow(),
-                description="Total HTTP requests"
-            ))
-            
-            # Add response time header
-            response.headers["X-Process-Time"] = str(process_time)
-            
-            return response
-        
-        monitor.log_info("monitoring", "FastAPI instrumentation complete")
-        
-    except Exception as e:
-        monitor.log_error("monitoring", "FastAPI instrumentation failed", str(e))
\ No newline at end of file
+def export_metrics_prometheus() -> str:
+    """Render the monitor's metrics as Prometheus text exposition lines, ending with a line feed"""
+    monitor = get_monitor()
+    metrics = monitor.get_metrics()
+    
+    prometheus_metrics = []
+    
+    # System metrics
+    system_stats = metrics.get("system_stats", {})
+    if "cpu_percent" in system_stats:
+        prometheus_metrics.append(f"system_cpu_percent {system_stats['cpu_percent']}")
+    if "memory_percent" in system_stats:
+        prometheus_metrics.append(f"system_memory_percent {system_stats['memory_percent']}")
+    if "disk_percent" in system_stats:
+        prometheus_metrics.append(f"system_disk_percent {system_stats['disk_percent']}")
+    
+    # Request metrics
+    request_stats = metrics.get("request_stats", {})
+    prometheus_metrics.append(f"http_requests_total {request_stats.get('total_requests', 0)}")
+    prometheus_metrics.append(f"http_request_duration_seconds {request_stats.get('avg_response_time', 0)}")
+    prometheus_metrics.append(f"http_errors_total {request_stats.get('total_requests', 0) * request_stats.get('error_rate', 0) / 100}")
+    
+    # Agent metrics
+    agent_stats = metrics.get("agent_stats", {})
+    prometheus_metrics.append(f"agent_executions_total {agent_stats.get('total_executions', 0)}")
+    prometheus_metrics.append(f"agent_success_rate {agent_stats.get('success_rate', 0)}")
+    
+    return "\n".join(prometheus_metrics) + "\n"  # the exposition format requires the last line to end with a line feed
\ No newline at end of file
diff --git a/backend/app/core/security.py b/backend/app/core/security.py
index 595c0cc..83344c2 100644
--- a/backend/app/core/security.py
+++ b/backend/app/core/security.py
@@ -1,12 +1,19 @@
 import os
 from datetime import datetime, timedelta
-from typing import Optional, Dict, Any
+from typing import Optional, Dict, Any, List
 from jose import JWTError, jwt
 from passlib.context import CryptContext
-from fastapi import HTTPException, status, Depends
+from fastapi import HTTPException, status, Depends, Request
 from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
+from sqlalchemy.orm import Session
+import redis
+import hashlib
+import secrets
+import logging
 
 from .config import settings
+from ..database.connection import get_db
+from ..database.models import User, Role, UserRole
 
 # Password hashing
 pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")
@@ -14,87 +21,212 @@
 # JWT settings
 security = HTTPBearer()
 
-# Mock user database (in production, this would be a real database)
-MOCK_USERS = {
-    "admin@example.com": {
-        "id": "1",
-        "email": "admin@example.com",
-        "full_name": "Admin User",
-        "hashed_password": "$2b$12$EixZaYVK1fsbw1ZfbX3OXePaWxn96p36WQoeG6Lruj3vjPGga31lW",  # password: secret
-        "is_active": True,
-        "is_superuser": True,
-        "roles": ["admin", "user"]
-    },
-    "user@example.com": {
-        "id": "2", 
-        "email": "user@example.com",
-        "full_name": "Regular User",
-        "hashed_password": "$2b$12$EixZaYVK1fsbw1ZfbX3OXePaWxn96p36WQoeG6Lruj3vjPGga31lW",  # password: secret
-        "is_active": True,
-        "is_superuser": False,
-        "roles": ["user"]
-    }
-}
+# Redis for token blacklisting and rate limiting
+redis_client = redis.Redis.from_url(settings.REDIS_URL, decode_responses=True)
 
+# Security logging
+logger = logging.getLogger(__name__)
 
-def verify_password(plain_password: str, hashed_password: str) -> bool:
-    """Verify a plain password against a hashed password"""
-    return pwd_context.verify(plain_password, hashed_password)
-
-
-def get_password_hash(password: str) -> str:
-    """Hash a password"""
-    return pwd_context.hash(password)
-
-
-def get_user_by_email(email: str) -> Optional[Dict[str, Any]]:
-    """Get user by email from mock database"""
-    return MOCK_USERS.get(email)
-
-
-def authenticate_user(email: str, password: str) -> Optional[Dict[str, Any]]:
-    """Authenticate a user by email and password"""
-    user = get_user_by_email(email)
-    if not user:
-        return None
-    if not verify_password(password, user["hashed_password"]):
-        return None
-    return user
-
-
-def create_access_token(data: Dict[str, Any], expires_delta: Optional[timedelta] = None) -> str:
-    """Create a JWT access token"""
-    to_encode = data.copy()
+class SecurityManager:
+    """Comprehensive security management for the application"""
     
-    if expires_delta:
-        expire = datetime.utcnow() + expires_delta
-    else:
-        expire = datetime.utcnow() + timedelta(minutes=settings.ACCESS_TOKEN_EXPIRE_MINUTES)
+    def __init__(self):
+        self.rate_limit_window = 3600  # seconds (1 hour); TTL given to each rate-limit counter
+        self.max_requests_per_window = 1000  # requests allowed per identifier within one window
+        self.token_blacklist_prefix = "blacklist:"  # Redis key prefix for blacklisted token hashes
+        self.rate_limit_prefix = "rate_limit:"  # Redis key prefix for per-identifier counters
     
-    to_encode.update({"exp": expire})
-    encoded_jwt = jwt.encode(to_encode, settings.SECRET_KEY, algorithm=settings.ALGORITHM)
-    return encoded_jwt
-
-
-def verify_token(token: str) -> Optional[Dict[str, Any]]:
-    """Verify and decode a JWT token"""
-    try:
-        payload = jwt.decode(token, settings.SECRET_KEY, algorithms=[settings.ALGORITHM])
-        email: str = payload.get("sub")
-        if email is None:
+    def verify_password(self, plain_password: str, hashed_password: str) -> bool:
+        """Return whether ``plain_password`` matches ``hashed_password`` according to the module's passlib context"""
+        return pwd_context.verify(plain_password, hashed_password)
+    
+    def get_password_hash(self, password: str) -> str:
+        """Hash ``password`` with the module's passlib context (configured with the bcrypt scheme)"""
+        return pwd_context.hash(password)
+    
+    def get_user_by_email(self, db: Session, email: str) -> Optional[User]:
+        """Return the first User row whose email equals ``email``, or None when nothing matches"""
+        return db.query(User).filter(User.email == email).first()
+    
+    def get_user_by_id(self, db: Session, user_id: int) -> Optional[User]:
+        """Return the first User row whose id equals ``user_id``, or None when nothing matches"""
+        return db.query(User).filter(User.id == user_id).first()
+    
+    def authenticate_user(self, db: Session, email: str, password: str) -> Optional[User]:
+        """Return the user when the email exists, the password verifies and the account is active; otherwise None"""
+        user = self.get_user_by_email(db, email)
+        if not user:
+            return None
+        if not self.verify_password(password, user.hashed_password):
+            return None
+        if not user.is_active:
             return None
+        return user
+    
+    def create_access_token(self, data: Dict[str, Any], expires_delta: Optional[timedelta] = None) -> str:
+        """Create a signed JWT from a copy of ``data`` plus ``exp``, ``iat`` and a random ``jti`` claim"""
+        to_encode = data.copy()
+        
+        if expires_delta:
+            expire = datetime.utcnow() + expires_delta
+        else:
+            expire = datetime.utcnow() + timedelta(minutes=settings.ACCESS_TOKEN_EXPIRE_MINUTES)
         
+        to_encode.update({
+            "exp": expire,
+            "iat": datetime.utcnow(),
+            "jti": secrets.token_urlsafe(32)  # unique JWT ID; NOTE(review): the blacklist in this class keys on the whole-token hash, not on jti
+        })
+        
+        encoded_jwt = jwt.encode(to_encode, settings.SECRET_KEY, algorithm=settings.ALGORITHM)
+        return encoded_jwt
+    
+    def verify_token(self, token: str) -> Optional[Dict[str, Any]]:
+        """Return the decoded payload, or None if the token is blacklisted, invalid, expired or lacks ``sub``/``exp``"""
+        try:
+            # Check if token is blacklisted
+            if self.is_token_blacklisted(token):
+                return None
+            
+            payload = jwt.decode(token, settings.SECRET_KEY, algorithms=[settings.ALGORITHM])
+            email: str = payload.get("sub")
+            if email is None:
+                return None
+            
+            # Check if token is expired; exp is a UTC epoch, so convert it to naive UTC (not local time) to match utcnow()
+            exp = payload.get("exp")
+            if exp is None or datetime.utcnow() > datetime.utcfromtimestamp(exp):
+                return None
+            
+            return payload
+        except JWTError:
             return None
+    
+    def blacklist_token(self, token: str, expires_in: int = 3600) -> bool:
+        """Blacklist ``token`` in Redis for ``expires_in`` seconds; returns False (after logging) if storing fails"""
+        try:
+            # Store only the SHA-256 hex digest of the token, never the token itself
+            token_hash = hashlib.sha256(token.encode()).hexdigest()
+            redis_client.setex(
+                f"{self.token_blacklist_prefix}{token_hash}",
+                expires_in,
+                "blacklisted"
+            )
+            return True
+        except Exception as e:
+            logger.error(f"Failed to blacklist token: {e}")
+            return False
+    
+    def is_token_blacklisted(self, token: str) -> bool:
+        """Return True if the token's SHA-256 digest has a blacklist key in Redis; returns False when the lookup raises"""
+        try:
+            token_hash = hashlib.sha256(token.encode()).hexdigest()
+            return redis_client.exists(f"{self.token_blacklist_prefix}{token_hash}") > 0
+        except Exception as e:
+            logger.error(f"Failed to check token blacklist: {e}")
+            return False  # fails open: while Redis errors, a blacklisted token is reported as not blacklisted
+    
+    def check_rate_limit(self, identifier: str) -> bool:
+        """Count one request for ``identifier`` (IP or user); return False once the per-window limit is exceeded"""
+        try:
+            key = f"{self.rate_limit_prefix}{identifier}"
+            current_requests = redis_client.incr(key)  # atomic: creates the counter at 1 or bumps it
             
-        return payload
-    except JWTError:
-        return None
+            if current_requests == 1:
+                # First request of a new window: start the expiry clock for this counter
+                redis_client.expire(key, self.rate_limit_window)
+            
+            # Counting before checking closes the old GET-then-INCR gap, where concurrent
+            # requests were undercounted and an expired key could be recreated without a TTL
+            if current_requests > self.max_requests_per_window:
+                return False
+            
+            return True
+        except Exception as e:
+            logger.error(f"Rate limit check failed: {e}")
+            return True  # Allow if Redis fails
+    
+    def get_user_permissions(self, db: Session, user: User) -> List[str]:
+        """Collect the de-duplicated permissions of every role assigned to ``user``; returns [] if anything raises"""
+        try:
+            user_roles = db.query(UserRole).filter(UserRole.user_id == user.id).all()
+            permissions = []
+            
+            for user_role in user_roles:
+                role = db.query(Role).filter(Role.id == user_role.role_id).first()
+                if role and role.permissions:
+                    if isinstance(role.permissions, list):
+                        permissions.extend(role.permissions)
+                    elif isinstance(role.permissions, dict):
+                        permissions.extend(role.permissions.get("permissions", []))
+            
+            return list(set(permissions))  # Remove duplicates (original order is not preserved)
+        except Exception as e:
+            logger.error(f"Failed to get user permissions: {e}")
+            return []
+    
+    def has_permission(self, db: Session, user: User, permission: str) -> bool:
+        """Return True if ``user`` is a superuser, holds ``permission`` or "*", or has the role named by "role:<name>"; False on error"""
+        try:
+            # Superuser has all permissions
+            if user.is_superuser:
+                return True
+            
+            user_permissions = self.get_user_permissions(db, user)
+            
+            # Check for wildcard permission
+            if "*" in user_permissions:
+                return True
+            
+            # Check for specific permission
+            if permission in user_permissions:
+                return True
+            
+            # Check for role-based permissions; everything after the first ":" is the role name
+            if permission.startswith("role:"):
+                required_role = permission.split(":", 1)[1]  # maxsplit=1 keeps colons that belong to the role name
+                user_roles = db.query(UserRole).filter(UserRole.user_id == user.id).all()
+                for user_role in user_roles:
+                    role = db.query(Role).filter(Role.id == user_role.role_id).first()
+                    if role and role.name == required_role:
+                        return True
+            
+            return False
+        except Exception as e:
+            logger.error(f"Permission check failed: {e}")
+            return False
+    
+    def log_security_event(self, event_type: str, user_id: Optional[int], 
+                          ip_address: str, details: Dict[str, Any]) -> None:
+        """Log a security event and push it onto the capped Redis list "security_events"; failures are logged, not raised"""
+        try:
+            event_data = {
+                "event_type": event_type,
+                "user_id": user_id,
+                "ip_address": ip_address,
+                "timestamp": datetime.utcnow().isoformat(),
+                "details": details
+            }
+            
+            # Log to application logs
+            logger.info(f"Security event: {event_data}")
+            
+            # Store in Redis for monitoring (as the dict's str() representation, not JSON)
+            redis_client.lpush("security_events", str(event_data))
+            redis_client.ltrim("security_events", 0, 999)  # Keep last 1000 events
+            
+        except Exception as e:
+            logger.error(f"Failed to log security event: {e}")
 
 
-async def get_current_user(credentials: HTTPAuthorizationCredentials = Depends(security)) -> Dict[str, Any]:
+# Global security manager instance
+security_manager = SecurityManager()
+
+
+async def get_current_user(
+    credentials: HTTPAuthorizationCredentials = Depends(security),
+    db: Session = Depends(get_db)
+) -> User:
     """Get the current authenticated user"""
     credentials_exception = HTTPException(
         status_code=status.HTTP_401_UNAUTHORIZED,
@@ -105,7 +237,7 @@ async def get_current_user(credentials: HTTPAuthorizationCredentials = Depends(s
     try:
         # Extract token from credentials
         token = credentials.credentials
-        payload = verify_token(token)
+        payload = security_manager.verify_token(token)
         
         if payload is None:
             raise credentials_exception
@@ -118,12 +250,12 @@ async def get_current_user(credentials: HTTPAuthorizationCredentials = Depends(s
         raise credentials_exception
     
     # Get user from database
-    user = get_user_by_email(email)
+    user = security_manager.get_user_by_email(db, email)
     if user is None:
         raise credentials_exception
         
     # Check if user is active
-    if not user.get("is_active", False):
+    if not user.is_active:
         raise HTTPException(
             status_code=status.HTTP_400_BAD_REQUEST,
             detail="Inactive user"
@@ -132,9 +264,9 @@ async def get_current_user(credentials: HTTPAuthorizationCredentials = Depends(s
     return user
 
 
-async def get_current_active_user(current_user: Dict[str, Any] = Depends(get_current_user)) -> Dict[str, Any]:
+async def get_current_active_user(current_user: User = Depends(get_current_user)) -> User:
     """Get the current active user"""
-    if not current_user.get("is_active", False):
+    if not current_user.is_active:
         raise HTTPException(
             status_code=status.HTTP_400_BAD_REQUEST,
             detail="Inactive user"
@@ -142,9 +274,9 @@ async def get_current_active_user(current_user: Dict[str, Any] = Depends(get_cur
     return current_user
 
 
-async def get_current_superuser(current_user: Dict[str, Any] = Depends(get_current_user)) -> Dict[str, Any]:
+async def get_current_superuser(current_user: User = Depends(get_current_user)) -> User:
     """Get the current superuser"""
-    if not current_user.get("is_superuser", False):
+    if not current_user.is_superuser:
         raise HTTPException(
             status_code=status.HTTP_403_FORBIDDEN,
             detail="Not enough permissions"
@@ -152,52 +284,112 @@ async def get_current_superuser(current_user: Dict[str, Any] = Depends(get_curre
     return current_user
 
 
-def has_permission(user: Dict[str, Any], permission: str) -> bool:
-    """Check if user has a specific permission"""
-    # Simple role-based permission check
-    user_roles = user.get("roles", [])
-    
-    # Admin has all permissions
-    if "admin" in user_roles:
-        return True
-    
-    # Permission mapping for different roles
-    permission_map = {
-        "user": [
-            "documents:read",
-            "documents:upload",
-            "qa:ask",
-            "analytics:read"
-        ],
-        "manager": [
-            "documents:read",
-            "documents:upload", 
-            "documents:delete",
-            "qa:ask",
-            "analytics:read",
-            "audit:read",
-            "settings:read"
-        ],
-        "admin": ["*"]  # All permissions
-    }
-    
-    for role in user_roles:
-        if role in permission_map:
-            role_permissions = permission_map[role]
-            if "*" in role_permissions or permission in role_permissions:
-                return True
-    
-    return False
-
-
 def require_permission(permission: str):
     """Decorator to require a specific permission"""
-    def permission_dependency(current_user: Dict[str, Any] = Depends(get_current_user)):
-        if not has_permission(current_user, permission):
+    def permission_dependency(
+        current_user: User = Depends(get_current_user),
+        db: Session = Depends(get_db)
+    ):
+        if not security_manager.has_permission(db, current_user, permission):
             raise HTTPException(
                 status_code=status.HTTP_403_FORBIDDEN,
                 detail=f"Permission '{permission}' required"
             )
         return current_user
     
+    return permission_dependency
+
+
+def rate_limit_middleware(request: Request, db: Session = Depends(get_db)):
+    """Rate-limit dependency: raises HTTP 429 when check_rate_limit rejects the client, else returns True"""
+    # request.client is None when no peer address is available; avoid AttributeError
+    client_ip = request.client.host if request.client else "unknown"
+    
+    # Check rate limit (db is not used here; kept so the signature stays unchanged)
+    if not security_manager.check_rate_limit(client_ip):
+        raise HTTPException(
+            status_code=status.HTTP_429_TOO_MANY_REQUESTS,
+            detail="Rate limit exceeded"
+        )
+    
+    return True
+
+
+def log_security_event_middleware(request: Request, current_user: Optional[User] = Depends(get_current_user)):
+    """Dependency that records an "api_request" security event for the current request; returns True"""
+    client_ip = request.client.host if request.client else None  # client may be None
+    
+    # Log the request (method, path and selected headers) via the security manager
+    security_manager.log_security_event(
+        event_type="api_request",
+        user_id=current_user.id if current_user else None,
+        ip_address=client_ip,
+        details={
+            "method": request.method,
+            "path": request.url.path,
+            "user_agent": request.headers.get("user-agent", ""),
+            "referer": request.headers.get("referer", "")
+        }
+    )
+    
+    return True
+
+
+# Backward compatibility functions
+def verify_password(plain_password: str, hashed_password: str) -> bool:
+    """Backward-compatible wrapper: delegates to security_manager.verify_password"""
+    return security_manager.verify_password(plain_password, hashed_password)
+
+
+def get_password_hash(password: str) -> str:
+    """Backward-compatible wrapper: delegates to security_manager.get_password_hash"""
+    return security_manager.get_password_hash(password)
+
+
+def get_user_by_email(email: str):
+    """Deprecated stub: takes no database session, so it logs a warning and always returns None"""
+    logger.warning("get_user_by_email called without database session - use database models directly")
+    return None
+
+
+def authenticate_user(email: str, password: str):
+    """Deprecated stub: performs no authentication; logs a warning and always returns None"""
+    logger.warning("authenticate_user called without database session - use SecurityManager.authenticate_user")
+    return None
+
+
+def create_access_token(data: Dict[str, Any], expires_delta: Optional[timedelta] = None) -> str:
+    """Backward-compatible wrapper: delegates to security_manager.create_access_token"""
+    return security_manager.create_access_token(data, expires_delta)
+
+
+def verify_token(token: str) -> Optional[Dict[str, Any]]:
+    """Backward-compatible wrapper: delegates to security_manager.verify_token"""
+    return security_manager.verify_token(token)
+
+
+async def get_current_user_legacy(credentials: HTTPAuthorizationCredentials = Depends(security)) -> Dict[str, Any]:
+    """Deprecated: never returns a user; logs a warning and always raises HTTP 500"""
+    logger.warning("get_current_user_legacy called - use get_current_user with database session")
+    raise HTTPException(
+        status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+        detail="Legacy authentication not supported"
+    )
+
+
+def has_permission(user: Dict[str, Any], permission: str) -> bool:
+    """Deprecated stub for dict-based users: logs a warning and always returns False (denies)"""
+    logger.warning("has_permission called with dict user - use SecurityManager.has_permission with User model")
+    return False
+
+
+def require_permission_legacy(permission: str):
+    """Deprecated: returns a dependency that always fails with HTTP 500; `permission` is ignored"""
+    def permission_dependency(current_user: Dict[str, Any] = Depends(get_current_user_legacy)):
+        logger.warning("require_permission_legacy called - use require_permission with database session")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail="Legacy permission system not supported"
+        )
+    
     return permission_dependency
\ No newline at end of file
diff --git a/backend/app/database/connection.py b/backend/app/database/connection.py
index 7ecdf37..66e6095 100644
--- a/backend/app/database/connection.py
+++ b/backend/app/database/connection.py
@@ -1,250 +1,435 @@
-from sqlalchemy import create_engine, MetaData
+import os
+import logging
+from typing import Generator, Optional
+from sqlalchemy import create_engine, text, event
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker, Session
-from sqlalchemy.pool import StaticPool
-import logging
-from typing import Generator
+from sqlalchemy.pool import QueuePool
+from sqlalchemy.exc import SQLAlchemyError, OperationalError
+import asyncio
 
 from ..core.config import settings
 
 logger = logging.getLogger(__name__)
 
+# Create declarative base
+Base = declarative_base()
+
 # Database engine
 engine = None
 SessionLocal = None
-Base = declarative_base()
 
 
-def create_database_engine():
-    """Create database engine based on configuration"""
+def create_database_engine() -> None:
+    """Create and configure the database engine"""
     global engine, SessionLocal
     
-    database_url = settings.DATABASE_URL
-    
-    # Configure engine based on database type
-    if database_url.startswith("sqlite"):
-        # SQLite configuration
-        engine = create_engine(
-            database_url,
-            connect_args={"check_same_thread": False},
-            poolclass=StaticPool,
-            echo=settings.DEBUG
-        )
-    elif database_url.startswith("postgresql"):
-        # PostgreSQL configuration
+    try:
+        # Parse database URL
+        database_url = settings.DATABASE_URL
+        
+        # Create engine with connection pooling
         engine = create_engine(
             database_url,
-            pool_pre_ping=True,
-            pool_recycle=300,
-            echo=settings.DEBUG
+            poolclass=QueuePool,
+            pool_size=settings.DATABASE_POOL_SIZE,
+            max_overflow=settings.DATABASE_MAX_OVERFLOW,
+            pool_pre_ping=True,  # Enable connection health checks
+            pool_recycle=3600,   # Recycle connections after 1 hour
+            echo=settings.DEBUG,  # Log SQL queries in debug mode
+            future=True          # Use SQLAlchemy 2.0 style
         )
-    else:
-        # Default configuration
-        engine = create_engine(
-            database_url,
-            echo=settings.DEBUG
+        
+        # Create session factory
+        SessionLocal = sessionmaker(
+            autocommit=False,
+            autoflush=False,
+            bind=engine
         )
-    
-    # Create session factory
-    SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
-    
-    logger.info(f"Database engine created for: {database_url}")
-    return engine
+        
+        logger.info("Database engine created successfully")
+        
+    except Exception as e:
+        logger.error(f"Failed to create database engine: {e}")
+        raise
 
 
-def get_database_session() -> Generator[Session, None, None]:
-    """Get database session"""
+def get_db() -> Generator[Session, None, None]:
+    """Dependency to get database session"""
     if SessionLocal is None:
         create_database_engine()
     
     db = SessionLocal()
     try:
         yield db
+    except Exception as e:
+        logger.error(f"Database session error: {e}")
+        db.rollback()
+        raise
     finally:
         db.close()
 
 
-def init_database():
+async def init_database() -> None:
     """Initialize database tables"""
-    global engine
-    
-    if engine is None:
-        create_database_engine()
-    
     try:
+        if engine is None:
+            create_database_engine()
+        
         # Import all models to ensure they are registered
         from .models import (
-            User, Role, Document, Tag, ComplianceFramework,
+            User, Role, UserRole, Document, Tag, ComplianceFramework,
             ProcessingHistory, AgentExecution, DocumentComparison,
-            AuditEvent, SystemMetrics, WorkflowTemplate,
-            KnowledgeBase, Notification, APILog, SystemConfiguration
+            AuditEvent, SystemMetric, WorkflowTemplate, KnowledgeBase,
+            Notification, APILog, SystemConfig
         )
         
         # Create all tables
         Base.metadata.create_all(bind=engine)
+        
         logger.info("Database tables created successfully")
         
-        # Create default data
-        create_default_data()
+        # Initialize default data
+        await initialize_default_data()
         
     except Exception as e:
-        logger.error(f"Failed to initialize database: {e}")
+        logger.error(f"Database initialization failed: {e}")
         raise
 
 
-def create_default_data():
-    """Create default data in database"""
+async def initialize_default_data() -> None:
+    """Initialize default data in the database"""
     try:
-        db = next(get_database_session())
+        db = SessionLocal()
         
-        # Import models
-        from .models import User, Role, ComplianceFramework, Tag
-        from ..core.security import get_password_hash
+        # Local import (models.py imports Base from here); skip seeding if the admin exists
+        from .models import User, Role, UserRole, ComplianceFramework, Tag, SystemConfig
+        if db.query(User).filter(User.email == "admin@example.com").first():
+            logger.info("Default data already exists, skipping initialization")
+            return
         
         # Create default roles
-        default_roles = [
-            {"name": "admin", "description": "Administrator with full access"},
-            {"name": "manager", "description": "Manager with limited admin access"},
-            {"name": "user", "description": "Regular user with basic access"}
-        ]
+        admin_role = Role(
+            name="admin",
+            description="Administrator with full access",
+            permissions=["*"]
+        )
         
-        for role_data in default_roles:
-            existing_role = db.query(Role).filter(Role.name == role_data["name"]).first()
-            if not existing_role:
-                role = Role(**role_data)
-                db.add(role)
+        user_role = Role(
+            name="user",
+            description="Standard user with basic access",
+            permissions=[
+                "documents:read",
+                "documents:upload",
+                "documents:delete",
+                "qa:ask",
+                "compare:compare",
+                "analytics:view"
+            ]
+        )
+        
+        analyst_role = Role(
+            name="analyst",
+            description="Analyst with advanced access",
+            permissions=[
+                "documents:read",
+                "documents:upload",
+                "documents:delete",
+                "qa:ask",
+                "compare:compare",
+                "analytics:view",
+                "analytics:export",
+                "audit:view"
+            ]
+        )
+        
+        db.add_all([admin_role, user_role, analyst_role])
+        db.commit()
         
         # Create default admin user
-        admin_email = "admin@example.com"
-        existing_admin = db.query(User).filter(User.email == admin_email).first()
-        if not existing_admin:
-            admin_user = User(
-                email=admin_email,
-                hashed_password=get_password_hash("admin123"),
-                full_name="System Administrator",
-                is_active=True,
-                is_superuser=True
-            )
-            db.add(admin_user)
+        from ..core.security import security_manager
+        
+        admin_user = User(
+            email="admin@example.com",
+            full_name="System Administrator",
+            hashed_password=security_manager.get_password_hash("admin123"),
+            is_active=True,
+            is_superuser=True
+        )
+        
+        db.add(admin_user)
+        db.commit()
+        
+        # Assign admin role to admin user
+        admin_user_role = UserRole(
+            user_id=admin_user.id,
+            role_id=admin_role.id,
+            assigned_by=admin_user.id
+        )
+        
+        db.add(admin_user_role)
+        db.commit()
         
         # Create default compliance frameworks
-        default_frameworks = [
-            {"name": "GDPR", "description": "General Data Protection Regulation"},
-            {"name": "HIPAA", "description": "Health Insurance Portability and Accountability Act"},
-            {"name": "SOX", "description": "Sarbanes-Oxley Act"},
-            {"name": "PCI-DSS", "description": "Payment Card Industry Data Security Standard"}
+        frameworks = [
+            ComplianceFramework(
+                name="GDPR",
+                description="General Data Protection Regulation",
+                version="2018",
+                requirements=["data_minimization", "consent", "right_to_erasure"]
+            ),
+            ComplianceFramework(
+                name="HIPAA",
+                description="Health Insurance Portability and Accountability Act",
+                version="1996",
+                requirements=["privacy_rule", "security_rule", "breach_notification"]
+            ),
+            ComplianceFramework(
+                name="SOX",
+                description="Sarbanes-Oxley Act",
+                version="2002",
+                requirements=["financial_reporting", "internal_controls", "audit_requirements"]
+            )
         ]
         
-        for framework_data in default_frameworks:
-            existing_framework = db.query(ComplianceFramework).filter(
-                ComplianceFramework.name == framework_data["name"]
-            ).first()
-            if not existing_framework:
-                framework = ComplianceFramework(**framework_data)
-                db.add(framework)
+        db.add_all(frameworks)
+        db.commit()
         
         # Create default tags
-        default_tags = [
-            {"name": "contract", "description": "Legal contracts", "color": "#2196F3"},
-            {"name": "invoice", "description": "Financial invoices", "color": "#4CAF50"},
-            {"name": "policy", "description": "Company policies", "color": "#FF9800"},
-            {"name": "report", "description": "Business reports", "color": "#9C27B0"},
-            {"name": "compliance", "description": "Compliance documents", "color": "#F44336"}
+        tags = [
+            Tag(name="confidential", description="Confidential documents"),
+            Tag(name="public", description="Public documents"),
+            Tag(name="financial", description="Financial documents"),
+            Tag(name="legal", description="Legal documents"),
+            Tag(name="hr", description="Human resources documents"),
+            Tag(name="technical", description="Technical documents")
         ]
         
-        for tag_data in default_tags:
-            existing_tag = db.query(Tag).filter(Tag.name == tag_data["name"]).first()
-            if not existing_tag:
-                tag = Tag(**tag_data)
-                db.add(tag)
+        db.add_all(tags)
+        db.commit()
+        
+        # Create default system configurations
+        configs = [
+            SystemConfig(
+                key="max_file_size_mb",
+                value="100",
+                description="Maximum file size in MB",
+                category="upload"
+            ),
+            SystemConfig(
+                key="allowed_file_types",
+                value="pdf,docx,txt,csv,xlsx",
+                description="Allowed file types",
+                category="upload"
+            ),
+            SystemConfig(
+                key="session_timeout_minutes",
+                value="30",
+                description="Session timeout in minutes",
+                category="security"
+            ),
+            SystemConfig(
+                key="audit_log_retention_days",
+                value="90",
+                description="Audit log retention period in days",
+                category="audit"
+            ),
+            SystemConfig(
+                key="backup_enabled",
+                value="true",
+                description="Enable automatic backups",
+                category="backup"
+            ),
+            SystemConfig(
+                key="monitoring_enabled",
+                value="true",
+                description="Enable system monitoring",
+                category="monitoring"
+            )
+        ]
         
-        # Commit all changes
+        db.add_all(configs)
         db.commit()
-        logger.info("Default data created successfully")
+        
+        logger.info("Default data initialized successfully")
         
     except Exception as e:
-        logger.error(f"Failed to create default data: {e}")
+        logger.error(f"Failed to initialize default data: {e}")
         db.rollback()
+        raise
     finally:
         db.close()
 
 
-def check_database_connection():
+async def check_database_connection() -> bool:
     """Check if database connection is working"""
     try:
         if engine is None:
             create_database_engine()
         
-        # Test connection
-        with engine.connect() as conn:
-            conn.execute("SELECT 1")
+        # Test connection with a simple query
+        with engine.connect() as connection:
+            result = connection.execute(text("SELECT 1"))
+            result.fetchone()
         
-        logger.info("Database connection test successful")
+        logger.info("Database connection check successful")
         return True
         
     except Exception as e:
-        logger.error(f"Database connection test failed: {e}")
+        logger.error(f"Database connection check failed: {e}")
         return False
 
 
-def get_database_info():
-    """Get database information"""
-    if engine is None:
-        return {"status": "not_initialized"}
-    
+async def get_database_info() -> dict:
+    """Get database information and statistics
+
+    Lookup failures are reported as a dict with status "error", not raised.
+    """
+    from datetime import datetime, timezone  # local import, used for timestamps
     try:
-        with engine.connect() as conn:
-            # Get database type
-            db_type = engine.dialect.name
-            
-            # Get database URL (without credentials)
-            db_url = str(engine.url).replace(str(engine.url.password), "****") if engine.url.password else str(engine.url)
-            
-            # Test connection
-            conn.execute("SELECT 1")
-            
-            return {
-                "status": "connected",
-                "type": db_type,
-                "url": db_url,
-                "pool_size": engine.pool.size() if hasattr(engine.pool, 'size') else None,
-                "checked_out_connections": engine.pool.checkedout() if hasattr(engine.pool, 'checkedout') else None
-            }
-            
+        if engine is None:
+            create_database_engine()
+        db = SessionLocal()
+        
+        # Get table counts (table names come only from this fixed list)
+        table_counts = {}
+        tables = [
+            "users", "roles", "user_roles", "documents", "tags",
+            "compliance_frameworks", "processing_history", "agent_executions",
+            "document_comparisons", "audit_events", "system_metrics",
+            "workflow_templates", "knowledge_base", "notifications",
+            "api_logs", "system_configs"
+        ]
+        
+        for table in tables:
+            try:
+                result = db.execute(text(f"SELECT COUNT(*) FROM {table}"))
+                table_counts[table] = result.scalar()
+            except Exception as e:
+                logger.warning(f"Could not get count for table {table}: {e}")
+                db.rollback()  # PostgreSQL rejects further queries until the failed transaction ends
+                table_counts[table] = 0
+        
+        # Get database size (PostgreSQL-specific functions; "unknown" if the query fails)
+        try:
+            db_size = db.execute(text("""
+                SELECT pg_size_pretty(pg_database_size(current_database())) as size
+            """)).scalar()
+        except Exception:
+            db.rollback()
+            db_size = "unknown"
+        
+        # Get connection info
+        pool_info = {
+            "pool_size": engine.pool.size(),
+            "checked_in": engine.pool.checkedin(),
+            "checked_out": engine.pool.checkedout(),
+            "overflow": engine.pool.overflow()
+        }
+        
+        return {
+            "status": "connected",
+            "database_url": engine.url.render_as_string(hide_password=True),  # masks password
+            "table_counts": table_counts,
+            "database_size": db_size,
+            "pool_info": pool_info,
+            "timestamp": datetime.now(timezone.utc).isoformat()
+        }
     except Exception as e:
+        logger.error(f"Failed to get database info: {e}")
         return {
             "status": "error",
-            "error": str(e)
+            "error": str(e),
+            "timestamp": datetime.now(timezone.utc).isoformat()
         }
+    finally:
+        if 'db' in locals():
+            db.close()
 
 
-def reset_database():
-    """Reset database (drop and recreate all tables)"""
-    global engine
-    
-    if engine is None:
+async def cleanup_database() -> None:
+    """Dispose of the engine if one was created; failures are logged, not raised"""
+    try:
+        if engine is not None:
+            engine.dispose()
+            logger.info("Database engine disposed")
+    except Exception as exc:
+        logger.error(f"Database cleanup failed: {exc}")
+
+
+def get_database_session() -> Session:
+    """Return a new session for use outside FastAPI dependencies; it is not closed here"""
+    if SessionLocal is None:
         create_database_engine()
-    
+    return SessionLocal()
+
+
+# Pool event listeners for logging; bound to the pool class because `engine` is None at import
+@event.listens_for(QueuePool, "connect")
+def receive_connect(dbapi_connection, connection_record):
+    logger.debug("Database connection established")
+
+
+@event.listens_for(QueuePool, "close")  # pool events have no "disconnect"; "close" is the hook
+def receive_disconnect(dbapi_connection, connection_record):
+    logger.debug("Database connection closed")
+
+
+@event.listens_for(QueuePool, "checkout")
+def receive_checkout(dbapi_connection, connection_record, connection_proxy):
+    logger.debug("Database connection checked out")
+
+
+@event.listens_for(QueuePool, "checkin")
+def receive_checkin(dbapi_connection, connection_record):
+    logger.debug("Database connection checked in")
+
+
+async def health_check_database() -> dict:
+    """Perform a comprehensive database health check
+
+    Returns a dict whose "status" is "healthy", "degraded" or "unhealthy".
+    """
+    from datetime import datetime, timezone  # local import, used for timestamps
     try:
-        # Drop all tables
-        Base.metadata.drop_all(bind=engine)
-        logger.info("All database tables dropped")
+        # Check connection
+        connection_ok = await check_database_connection()
+        if not connection_ok:
+            return {
+                "status": "unhealthy",
+                "error": "Database connection failed",
+                "timestamp": datetime.now(timezone.utc).isoformat()
+            }
         
-        # Recreate all tables
-        Base.metadata.create_all(bind=engine)
-        logger.info("All database tables recreated")
+        # Get database info
+        db_info = await get_database_info()
         
-        # Create default data
-        create_default_data()
+        issues = []  # problems found by the checks below
         
-        return True
+        # Check that at least one user exists
+        if db_info.get("table_counts", {}).get("users", 0) == 0:
+            issues.append("No users found in database")
+        
+        # Check if roles exist
+        if db_info.get("table_counts", {}).get("roles", 0) == 0:
+            issues.append("No roles found in database")
+        
+        # Check connection pool health (more than 80% of pool_size checked out)
+        pool_info = db_info.get("pool_info", {})
+        if pool_info.get("checked_out", 0) > pool_info.get("pool_size", 0) * 0.8:
+            issues.append("High connection pool usage")
+        
+        return {
+            "status": "degraded" if issues else "healthy",
+            "issues": issues,
+            "database_info": db_info,
+            "timestamp": datetime.now(timezone.utc).isoformat()
+        }
         
     except Exception as e:
-        logger.error(f"Failed to reset database: {e}")
-        return False
-
-
-# Database dependency for FastAPI
-def get_db() -> Generator[Session, None, None]:
-    """Database dependency for FastAPI endpoints"""
-    return get_database_session()
\ No newline at end of file
+        logger.error(f"Database health check failed: {e}")
+        return {
+            "status": "unhealthy",
+            "error": str(e),
+            "timestamp": datetime.now(timezone.utc).isoformat()
+        }
\ No newline at end of file
diff --git a/backend/app/database/models.py b/backend/app/database/models.py
index c4fa059..4bac976 100644
--- a/backend/app/database/models.py
+++ b/backend/app/database/models.py
@@ -1,33 +1,15 @@
-from sqlalchemy import Column, Integer, String, Text, DateTime, Boolean, Float, JSON, ForeignKey, Table
-from sqlalchemy.ext.declarative import declarative_base
-from sqlalchemy.orm import relationship
-from sqlalchemy.sql import func
+import uuid
 from datetime import datetime
-from typing import Optional, List, Dict, Any
-
-Base = declarative_base()
-
-# Association tables for many-to-many relationships
-document_tags = Table(
-    'document_tags',
-    Base.metadata,
-    Column('document_id', Integer, ForeignKey('documents.id'), primary_key=True),
-    Column('tag_id', Integer, ForeignKey('tags.id'), primary_key=True)
+from typing import List, Optional, Dict, Any
+from sqlalchemy import (
+    Column, Integer, String, Text, Boolean, DateTime, Float, 
+    ForeignKey, Table, MetaData, JSON, LargeBinary, Index,
+    UniqueConstraint, CheckConstraint, func
 )
+from sqlalchemy.orm import relationship, declarative_base
+from sqlalchemy.dialects.postgresql import UUID, JSONB
 
-document_compliance_frameworks = Table(
-    'document_compliance_frameworks',
-    Base.metadata,
-    Column('document_id', Integer, ForeignKey('documents.id'), primary_key=True),
-    Column('framework_id', Integer, ForeignKey('compliance_frameworks.id'), primary_key=True)
-)
-
-user_roles = Table(
-    'user_roles',
-    Base.metadata,
-    Column('user_id', Integer, ForeignKey('users.id'), primary_key=True),
-    Column('role_id', Integer, ForeignKey('roles.id'), primary_key=True)
-)
+from .connection import Base
 
 
 class User(Base):
@@ -36,8 +18,8 @@ class User(Base):
     
     id = Column(Integer, primary_key=True, index=True)
     email = Column(String(255), unique=True, index=True, nullable=False)
+    full_name = Column(String(255), nullable=False)
     hashed_password = Column(String(255), nullable=False)
-    full_name = Column(String(255), nullable=True)
     is_active = Column(Boolean, default=True)
     is_superuser = Column(Boolean, default=False)
     created_at = Column(DateTime(timezone=True), server_default=func.now())
@@ -45,10 +27,16 @@ class User(Base):
     last_login = Column(DateTime(timezone=True), nullable=True)
     
     # Relationships
-    documents = relationship("Document", back_populates="uploaded_by_user")
+    documents = relationship("Document", back_populates="user")
     audit_events = relationship("AuditEvent", back_populates="user")
-    processing_history = relationship("ProcessingHistory", back_populates="user")
-    user_roles = relationship("Role", secondary=user_roles, back_populates="users")
+    notifications = relationship("Notification", back_populates="user")
+    
+    # Indexes
+    __table_args__ = (
+        Index('idx_users_email', 'email'),
+        Index('idx_users_active', 'is_active'),
+        Index('idx_users_created_at', 'created_at'),
+    )
 
 
 class Role(Base):
@@ -58,11 +46,40 @@ class Role(Base):
     id = Column(Integer, primary_key=True, index=True)
     name = Column(String(100), unique=True, nullable=False)
     description = Column(Text, nullable=True)
-    permissions = Column(JSON, nullable=True)  # Store permissions as JSON
+    permissions = Column(JSON, default=list)  # List of permission strings
     created_at = Column(DateTime(timezone=True), server_default=func.now())
+    updated_at = Column(DateTime(timezone=True), onupdate=func.now())
     
     # Relationships
-    users = relationship("User", secondary=user_roles, back_populates="user_roles")
+    user_roles = relationship("UserRole", back_populates="role")
+    
+    # Indexes
+    __table_args__ = (
+        Index('idx_roles_name', 'name'),
+        Index('idx_roles_created_at', 'created_at'),
+    )
+
+
+class UserRole(Base):
+    """Association row linking a user to a role, with assignment time and the assigning user"""
+    __tablename__ = "user_roles"
+    
+    user_id = Column(Integer, ForeignKey('users.id'), primary_key=True)
+    role_id = Column(Integer, ForeignKey('roles.id'), primary_key=True)
+    assigned_at = Column(DateTime(timezone=True), server_default=func.now())
+    assigned_by = Column(Integer, ForeignKey('users.id'), nullable=True)
+    
+    # Relationships (foreign_keys is explicit because two columns reference users.id)
+    user = relationship("User", foreign_keys=[user_id])
+    role = relationship("Role", foreign_keys=[role_id], back_populates="user_roles")
+    assigned_by_user = relationship("User", foreign_keys=[assigned_by])
+    
+    # Indexes on each key column and on the assignment timestamp
+    __table_args__ = (
+        Index('idx_user_roles_user_id', 'user_id'),
+        Index('idx_user_roles_role_id', 'role_id'),
+        Index('idx_user_roles_assigned_at', 'assigned_at'),
+    )
 
 
 class Document(Base):
@@ -70,45 +87,45 @@ class Document(Base):
     __tablename__ = "documents"
     
     id = Column(Integer, primary_key=True, index=True)
+    uuid = Column(UUID(as_uuid=True), default=uuid.uuid4, unique=True, index=True)  # From here on, `uuid` in this class body is the Column, not the module
     filename = Column(String(255), nullable=False)
     original_filename = Column(String(255), nullable=False)
-    content = Column(Text, nullable=True)  # Extracted text content
     file_path = Column(String(500), nullable=False)
     file_size = Column(Integer, nullable=False)
-    content_type = Column(String(100), nullable=False)
-    doc_type = Column(String(50), nullable=True)  # CONTRACT, INVOICE, etc.
-    domain = Column(String(100), nullable=True)  # LEGAL, FINANCIAL, etc.
-    
-    # Processing metadata
-    processing_status = Column(String(50), default="pending")  # pending, processing, completed, failed
-    processing_result = Column(JSON, nullable=True)  # Store agent processing results
-    confidence_score = Column(Float, nullable=True)
-    risk_score = Column(Float, nullable=True)
+    file_type = Column(String(50), nullable=False)
+    mime_type = Column(String(100), nullable=False)
     
-    # Metadata
-    metadata = Column(JSON, nullable=True)  # Additional metadata
-    tags = Column(JSON, nullable=True)  # Document tags
+    # Content and processing
+    extracted_text = Column(Text, nullable=True)
+    summary = Column(Text, nullable=True)
     entities = Column(JSON, nullable=True)  # Extracted entities
-    clauses = Column(JSON, nullable=True)  # Extracted clauses
-    risks = Column(JSON, nullable=True)  # Risk assessment results
-    qa_pairs = Column(JSON, nullable=True)  # Generated Q&A pairs
+    document_metadata = Column("metadata", JSON, nullable=True)  # DB column stays "metadata"; the attribute name `metadata` is reserved by Declarative
     
-    # Timestamps
+    # Status and processing
+    status = Column(String(50), default="uploaded")  # uploaded, processing, completed, failed
+    processing_progress = Column(Float, default=0.0)
+    processing_error = Column(Text, nullable=True)
+    
+    # User and timestamps
+    user_id = Column(Integer, ForeignKey('users.id'), nullable=False)
     uploaded_at = Column(DateTime(timezone=True), server_default=func.now())
     processed_at = Column(DateTime(timezone=True), nullable=True)
     updated_at = Column(DateTime(timezone=True), onupdate=func.now())
     
-    # Foreign keys
-    uploaded_by = Column(Integer, ForeignKey("users.id"), nullable=False)
-    
     # Relationships
-    uploaded_by_user = relationship("User", back_populates="documents")
+    user = relationship("User", back_populates="documents")
+    tags = relationship("DocumentTag", back_populates="document")  # DocumentTag association rows; pairs with DocumentTag.document
     processing_history = relationship("ProcessingHistory", back_populates="document")
-    audit_events = relationship("AuditEvent", back_populates="document")
-    comparisons = relationship("DocumentComparison", foreign_keys="DocumentComparison.document_a_id")
-    comparisons_as_b = relationship("DocumentComparison", foreign_keys="DocumentComparison.document_b_id")
-    document_tags = relationship("Tag", secondary=document_tags, back_populates="documents")
-    compliance_frameworks = relationship("ComplianceFramework", secondary=document_compliance_frameworks, back_populates="documents")
+    agent_executions = relationship("AgentExecution", back_populates="document")
+    
+    # Indexes
+    __table_args__ = (
+        Index('idx_documents_uuid', 'uuid'),  # NOTE(review): uuid is already unique=True, index=True; this may be redundant
+        Index('idx_documents_user_id', 'user_id'),
+        Index('idx_documents_status', 'status'),
+        Index('idx_documents_uploaded_at', 'uploaded_at'),
+        Index('idx_documents_file_type', 'file_type'),
+    )
 
 
 class Tag(Base):
@@ -118,11 +135,38 @@ class Tag(Base):
     id = Column(Integer, primary_key=True, index=True)
     name = Column(String(100), unique=True, nullable=False)
     description = Column(Text, nullable=True)
-    color = Column(String(7), nullable=True)  # Hex color code
+    color = Column(String(7), default="#2196F3")  # Hex color code such as "#2196F3" (7 characters, matching String(7))
     created_at = Column(DateTime(timezone=True), server_default=func.now())
     
     # Relationships
-    documents = relationship("Document", secondary=document_tags, back_populates="document_tags")
+    document_tags = relationship("DocumentTag", back_populates="tag")  # DocumentTag association rows; pairs with DocumentTag.tag
+    
+    # Indexes
+    __table_args__ = (
+        Index('idx_tags_name', 'name'),  # NOTE(review): name is already unique=True, so this may duplicate the unique index
+        Index('idx_tags_created_at', 'created_at'),
+    )
+
+
+class DocumentTag(Base):
+    """Association object between documents and tags; also stores the assignment time and an optional assigning user."""
+    __tablename__ = "document_tags"
+    
+    document_id = Column(Integer, ForeignKey('documents.id'), primary_key=True)  # Composite primary key together with tag_id
+    tag_id = Column(Integer, ForeignKey('tags.id'), primary_key=True)
+    assigned_at = Column(DateTime(timezone=True), server_default=func.now())
+    assigned_by = Column(Integer, ForeignKey('users.id'), nullable=True)
+    
+    # Relationships
+    document = relationship("Document", back_populates="tags")  # Pairs with Document.tags
+    tag = relationship("Tag", back_populates="document_tags")  # Pairs with Tag.document_tags
+    assigned_by_user = relationship("User", foreign_keys=[assigned_by])
+    
+    # Indexes
+    __table_args__ = (
+        Index('idx_document_tags_document_id', 'document_id'),
+        Index('idx_document_tags_tag_id', 'tag_id'),
+    )
 
 
 class ComplianceFramework(Base):
@@ -130,14 +174,18 @@ class ComplianceFramework(Base):
     __tablename__ = "compliance_frameworks"
     
     id = Column(Integer, primary_key=True, index=True)
-    name = Column(String(100), unique=True, nullable=False)  # GDPR, SOX, HIPAA, etc.
+    name = Column(String(100), unique=True, nullable=False)  # Framework name, e.g. GDPR, SOX, HIPAA
     description = Column(Text, nullable=True)
-    version = Column(String(20), nullable=True)
-    requirements = Column(JSON, nullable=True)  # Framework requirements
+    version = Column(String(50), nullable=True)
+    requirements = Column(JSON, nullable=True)  # List of requirements
     created_at = Column(DateTime(timezone=True), server_default=func.now())
+    updated_at = Column(DateTime(timezone=True), onupdate=func.now())  # No insert default: stays NULL until the row is first updated
     
-    # Relationships
-    documents = relationship("Document", secondary=document_compliance_frameworks, back_populates="compliance_frameworks")
+    # Indexes
+    __table_args__ = (
+        Index('idx_compliance_frameworks_name', 'name'),  # NOTE(review): name is already unique=True, so this may be redundant
+        Index('idx_compliance_frameworks_created_at', 'created_at'),
+    )
 
 
 class ProcessingHistory(Base):
@@ -145,299 +193,328 @@ class ProcessingHistory(Base):
     __tablename__ = "processing_history"
     
     id = Column(Integer, primary_key=True, index=True)
-    processing_id = Column(String(100), unique=True, index=True, nullable=False)
-    
-    # Processing details
-    workflow_id = Column(String(100), nullable=True)
-    current_stage = Column(String(100), nullable=True)
-    completed_stages = Column(JSON, nullable=True)  # List of completed stages
-    failed_stages = Column(JSON, nullable=True)  # List of failed stages
-    total_execution_time = Column(Float, nullable=True)  # Total execution time in seconds
-    progress_percentage = Column(Float, default=0.0)
-    
-    # Results
-    status = Column(String(50), default="pending")  # pending, running, completed, failed
+    document_id = Column(Integer, ForeignKey('documents.id'), nullable=False)  # Each history row belongs to exactly one document
+    stage = Column(String(100), nullable=False)  # ingestion, classification, extraction, etc.
+    status = Column(String(50), nullable=False)  # started, completed, failed (required; no default)
+    start_time = Column(DateTime(timezone=True), server_default=func.now())  # Filled by the database at insert
+    end_time = Column(DateTime(timezone=True), nullable=True)
+    duration = Column(Float, nullable=True)  # Duration in seconds; stored as given, not derived by this model
     result = Column(JSON, nullable=True)  # Processing result
-    confidence = Column(Float, nullable=True)
-    rationale = Column(Text, nullable=True)
     error_message = Column(Text, nullable=True)
     
-    # Timestamps
-    started_at = Column(DateTime(timezone=True), server_default=func.now())
-    completed_at = Column(DateTime(timezone=True), nullable=True)
-    
-    # Foreign keys
-    document_id = Column(Integer, ForeignKey("documents.id"), nullable=False)
-    user_id = Column(Integer, ForeignKey("users.id"), nullable=False)
-    
     # Relationships
     document = relationship("Document", back_populates="processing_history")
-    user = relationship("User", back_populates="processing_history")
-    agent_executions = relationship("AgentExecution", back_populates="processing_history")
+    
+    # Indexes
+    __table_args__ = (
+        Index('idx_processing_history_document_id', 'document_id'),
+        Index('idx_processing_history_stage', 'stage'),
+        Index('idx_processing_history_status', 'status'),
+        Index('idx_processing_history_start_time', 'start_time'),
+    )
 
 
 class AgentExecution(Base):
-    """Agent execution model for tracking individual agent runs"""
+    """One agent run: its type, JSON input/output, timing, status and error info, optionally linked to a document."""
     __tablename__ = "agent_executions"
     
     id = Column(Integer, primary_key=True, index=True)
-    execution_id = Column(String(100), unique=True, index=True, nullable=False)
-    
-    # Agent details
-    agent_type = Column(String(50), nullable=False)  # classifier, entity, risk, etc.
-    agent_name = Column(String(100), nullable=False)
+    document_id = Column(Integer, ForeignKey('documents.id'), nullable=True)  # Nullable: a run need not be tied to a document
+    agent_type = Column(String(100), nullable=False)  # orchestrator, classifier, entity, etc.
+    execution_id = Column(UUID(as_uuid=True), default=uuid.uuid4, unique=True, index=True)  # Generated client-side per insert
     
     # Execution details
-    input_size = Column(Integer, nullable=True)
-    output_size = Column(Integer, nullable=True)
-    execution_time = Column(Float, nullable=True)  # Execution time in seconds
-    memory_usage = Column(Float, nullable=True)  # Memory usage in MB
-    cpu_usage = Column(Float, nullable=True)  # CPU usage percentage
-    
-    # Results
-    status = Column(String(50), default="pending")  # pending, running, completed, failed
-    confidence = Column(Float, nullable=True)
-    output = Column(JSON, nullable=True)  # Agent output
-    error_message = Column(Text, nullable=True)
+    input_data = Column(JSON, nullable=True)
+    output_data = Column(JSON, nullable=True)
+    confidence_score = Column(Float, nullable=True)
     
-    # Timestamps
-    started_at = Column(DateTime(timezone=True), server_default=func.now())
-    completed_at = Column(DateTime(timezone=True), nullable=True)
+    # Timing and status
+    start_time = Column(DateTime(timezone=True), server_default=func.now())
+    end_time = Column(DateTime(timezone=True), nullable=True)
+    duration = Column(Float, nullable=True)  # Duration in seconds
+    status = Column(String(50), default="running")  # running, completed, failed, timeout
     
-    # Foreign keys
-    processing_history_id = Column(Integer, ForeignKey("processing_history.id"), nullable=False)
+    # Error handling
+    error_message = Column(Text, nullable=True)
+    retry_count = Column(Integer, default=0)
     
     # Relationships
-    processing_history = relationship("ProcessingHistory", back_populates="agent_executions")
+    document = relationship("Document", back_populates="agent_executions")
+    
+    # Indexes
+    __table_args__ = (
+        Index('idx_agent_executions_execution_id', 'execution_id'),  # NOTE(review): column is already unique=True, index=True
+        Index('idx_agent_executions_document_id', 'document_id'),
+        Index('idx_agent_executions_agent_type', 'agent_type'),
+        Index('idx_agent_executions_status', 'status'),
+        Index('idx_agent_executions_start_time', 'start_time'),
+    )
 
 
 class DocumentComparison(Base):
-    """Document comparison model"""
+    """Stored result of comparing two documents: similarity score, JSON differences and a text summary."""
     __tablename__ = "document_comparisons"
     
     id = Column(Integer, primary_key=True, index=True)
-    comparison_id = Column(String(100), unique=True, index=True, nullable=False)
+    comparison_id = Column(UUID(as_uuid=True), default=uuid.uuid4, unique=True, index=True)
     
-    # Comparison details
-    comparison_type = Column(String(50), nullable=False)  # semantic, structural, compliance, etc.
+    # Documents being compared (two FKs to documents.id, so relationships below name their foreign_keys)
+    document1_id = Column(Integer, ForeignKey('documents.id'), nullable=False)
+    document2_id = Column(Integer, ForeignKey('documents.id'), nullable=False)
+    
+    # Comparison results
     similarity_score = Column(Float, nullable=True)
     differences = Column(JSON, nullable=True)  # Detailed differences
-    risk_changes = Column(JSON, nullable=True)  # Risk changes between documents
-    
-    # Results
-    status = Column(String(50), default="pending")
-    result = Column(JSON, nullable=True)
-    confidence = Column(Float, nullable=True)
     summary = Column(Text, nullable=True)
     
-    # Timestamps
+    # Metadata
+    comparison_type = Column(String(50), default="content")  # content, structure, metadata
     created_at = Column(DateTime(timezone=True), server_default=func.now())
     completed_at = Column(DateTime(timezone=True), nullable=True)
     
-    # Foreign keys
-    document_a_id = Column(Integer, ForeignKey("documents.id"), nullable=False)
-    document_b_id = Column(Integer, ForeignKey("documents.id"), nullable=False)
-    created_by = Column(Integer, ForeignKey("users.id"), nullable=False)
-    
     # Relationships
-    document_a = relationship("Document", foreign_keys=[document_a_id])
-    document_b = relationship("Document", foreign_keys=[document_b_id])
-    creator = relationship("User")
+    document1 = relationship("Document", foreign_keys=[document1_id])
+    document2 = relationship("Document", foreign_keys=[document2_id])
+    
+    # Indexes
+    __table_args__ = (
+        Index('idx_document_comparisons_comparison_id', 'comparison_id'),  # NOTE(review): column is already unique=True, index=True
+        Index('idx_document_comparisons_document1_id', 'document1_id'),
+        Index('idx_document_comparisons_document2_id', 'document2_id'),
+        Index('idx_document_comparisons_created_at', 'created_at'),
+    )
 
 
 class AuditEvent(Base):
-    """Audit event model for compliance and security auditing"""
+    """Audit log entry: event type/category/severity plus optional user, session, client and resource context."""
     __tablename__ = "audit_events"
     
     id = Column(Integer, primary_key=True, index=True)
-    event_id = Column(String(100), unique=True, index=True, nullable=False)
+    event_id = Column(UUID(as_uuid=True), default=uuid.uuid4, unique=True, index=True)
     
     # Event details
-    event_type = Column(String(100), nullable=False)  # document_upload, processing_start, etc.
-    event_category = Column(String(50), nullable=False)  # security, compliance, system, etc.
+    event_type = Column(String(100), nullable=False)  # login, logout, document_upload, etc.
+    event_category = Column(String(50), nullable=False)  # authentication, document, system, etc.
     severity = Column(String(20), default="info")  # info, warning, error, critical
     
-    # Event data
-    description = Column(Text, nullable=False)
-    details = Column(JSON, nullable=True)  # Additional event details
+    # User and session
+    user_id = Column(Integer, ForeignKey('users.id'), nullable=True)  # Nullable: events may be recorded without a user
+    session_id = Column(String(255), nullable=True)
     ip_address = Column(String(45), nullable=True)  # IPv4 or IPv6
     user_agent = Column(Text, nullable=True)
     
+    # Event data
+    details = Column(JSON, nullable=True)  # Additional event details
+    resource_type = Column(String(100), nullable=True)  # document, user, system, etc.
+    resource_id = Column(String(255), nullable=True)  # Plain string, not a foreign key
+    
     # Timestamps
     timestamp = Column(DateTime(timezone=True), server_default=func.now())
     
-    # Foreign keys
-    user_id = Column(Integer, ForeignKey("users.id"), nullable=True)
-    document_id = Column(Integer, ForeignKey("documents.id"), nullable=True)
-    
     # Relationships
     user = relationship("User", back_populates="audit_events")
-    document = relationship("Document", back_populates="audit_events")
+    
+    # Indexes
+    __table_args__ = (
+        Index('idx_audit_events_event_id', 'event_id'),  # NOTE(review): column is already unique=True, index=True
+        Index('idx_audit_events_user_id', 'user_id'),
+        Index('idx_audit_events_event_type', 'event_type'),
+        Index('idx_audit_events_event_category', 'event_category'),
+        Index('idx_audit_events_severity', 'severity'),
+        Index('idx_audit_events_timestamp', 'timestamp'),
+        Index('idx_audit_events_ip_address', 'ip_address'),
+    )
 
 
-class SystemMetrics(Base):
-    """System metrics model for monitoring and performance tracking"""
+class SystemMetric(Base):
+    """A single named metric sample (value, optional unit, tags and source) with a database-generated timestamp."""
     __tablename__ = "system_metrics"
     
     id = Column(Integer, primary_key=True, index=True)
-    metric_id = Column(String(100), unique=True, index=True, nullable=False)
-    
-    # Metric details
     metric_name = Column(String(100), nullable=False)
-    metric_type = Column(String(50), nullable=False)  # counter, gauge, histogram, summary
-    value = Column(Float, nullable=False)
-    labels = Column(JSON, nullable=True)  # Metric labels
+    metric_value = Column(Float, nullable=False)
+    metric_unit = Column(String(20), nullable=True)  # seconds, bytes, percent, etc.
+    
+    # Context
+    tags = Column(JSON, nullable=True)  # Key-value pairs for filtering
+    source = Column(String(100), nullable=True)  # system, application, agent, etc.
     
     # Timestamps
     timestamp = Column(DateTime(timezone=True), server_default=func.now())
     
-    # Additional metadata
-    description = Column(Text, nullable=True)
-    unit = Column(String(20), nullable=True)  # seconds, bytes, percentage, etc.
+    # Indexes (single-column only; no composite name+timestamp index is declared)
+    __table_args__ = (
+        Index('idx_system_metrics_metric_name', 'metric_name'),
+        Index('idx_system_metrics_timestamp', 'timestamp'),
+        Index('idx_system_metrics_source', 'source'),
+    )
 
 
 class WorkflowTemplate(Base):
-    """Workflow template model for predefined processing workflows"""
+    """Uniquely named, versioned workflow definition: required JSON stages plus optional conditions and settings."""
     __tablename__ = "workflow_templates"
     
     id = Column(Integer, primary_key=True, index=True)
-    template_id = Column(String(100), unique=True, index=True, nullable=False)
-    
-    # Template details
-    name = Column(String(100), nullable=False)
+    name = Column(String(100), unique=True, nullable=False)
     description = Column(Text, nullable=True)
-    version = Column(String(20), default="1.0.0")
+    version = Column(String(20), default="1.0")  # Stored as a string, so "1.0" is kept verbatim
     
-    # Workflow configuration
-    stages = Column(JSON, nullable=False)  # Workflow stages configuration
-    agent_config = Column(JSON, nullable=True)  # Agent-specific configuration
-    workflow_config = Column(JSON, nullable=True)  # Workflow-specific configuration
+    # Workflow definition
+    stages = Column(JSON, nullable=False)  # List of workflow stages
+    conditions = Column(JSON, nullable=True)  # Conditional logic
+    settings = Column(JSON, nullable=True)  # Workflow settings
     
     # Metadata
     is_active = Column(Boolean, default=True)
     created_at = Column(DateTime(timezone=True), server_default=func.now())
     updated_at = Column(DateTime(timezone=True), onupdate=func.now())
     
-    # Foreign keys
-    created_by = Column(Integer, ForeignKey("users.id"), nullable=False)
-    
-    # Relationships
-    creator = relationship("User")
+    # Indexes
+    __table_args__ = (
+        Index('idx_workflow_templates_name', 'name'),  # NOTE(review): name is already unique=True, so this may be redundant
+        Index('idx_workflow_templates_is_active', 'is_active'),
+        Index('idx_workflow_templates_created_at', 'created_at'),
+    )
 
 
 class KnowledgeBase(Base):
-    """Knowledge base model for storing domain knowledge and rules"""
+    """Knowledge base entry: titled text content with optional category, tags, source and confidence."""
     __tablename__ = "knowledge_base"
     
     id = Column(Integer, primary_key=True, index=True)
-    kb_id = Column(String(100), unique=True, index=True, nullable=False)
-    
-    # Knowledge base details
-    name = Column(String(100), nullable=False)
-    description = Column(Text, nullable=True)
-    domain = Column(String(100), nullable=True)  # legal, financial, healthcare, etc.
+    entry_id = Column(UUID(as_uuid=True), default=uuid.uuid4, unique=True, index=True)
     
     # Content
+    title = Column(String(255), nullable=False)
     content = Column(Text, nullable=False)
-    content_type = Column(String(50), nullable=False)  # rule, guideline, policy, etc.
-    vector_embedding = Column(JSON, nullable=True)  # Vector embedding for similarity search
+    content_type = Column(String(50), default="text")  # text, qa, rule, etc.
+    
+    # Categorization
+    category = Column(String(100), nullable=True)
+    tags = Column(JSON, nullable=True)  # List of tags (a JSON column here, not a relationship to Tag)
     
     # Metadata
     source = Column(String(255), nullable=True)
-    version = Column(String(20), default="1.0.0")
+    confidence = Column(Float, nullable=True)
     is_active = Column(Boolean, default=True)
+    
+    # Timestamps
     created_at = Column(DateTime(timezone=True), server_default=func.now())
     updated_at = Column(DateTime(timezone=True), onupdate=func.now())
     
-    # Foreign keys
-    created_by = Column(Integer, ForeignKey("users.id"), nullable=False)
-    
-    # Relationships
-    creator = relationship("User")
+    # Indexes
+    __table_args__ = (
+        Index('idx_knowledge_base_entry_id', 'entry_id'),  # NOTE(review): column is already unique=True, index=True
+        Index('idx_knowledge_base_title', 'title'),
+        Index('idx_knowledge_base_category', 'category'),
+        Index('idx_knowledge_base_content_type', 'content_type'),
+        Index('idx_knowledge_base_is_active', 'is_active'),
+        Index('idx_knowledge_base_created_at', 'created_at'),
+    )
 
 
 class Notification(Base):
-    """Notification model for system notifications and alerts"""
+    """Notification addressed to one user, with read state, priority, optional JSON payload and optional expiry."""
     __tablename__ = "notifications"
     
     id = Column(Integer, primary_key=True, index=True)
-    notification_id = Column(String(100), unique=True, index=True, nullable=False)
+    notification_id = Column(UUID(as_uuid=True), default=uuid.uuid4, unique=True, index=True)
+    
+    # Recipient
+    user_id = Column(Integer, ForeignKey('users.id'), nullable=False)
     
     # Notification details
     title = Column(String(255), nullable=False)
     message = Column(Text, nullable=False)
-    notification_type = Column(String(50), nullable=False)  # info, warning, error, success
-    priority = Column(String(20), default="normal")  # low, normal, high, urgent
+    notification_type = Column(String(50), default="info")  # info, warning, error, success
     
-    # Delivery
+    # Status
     is_read = Column(Boolean, default=False)
-    is_sent = Column(Boolean, default=False)
-    sent_at = Column(DateTime(timezone=True), nullable=True)
+    read_at = Column(DateTime(timezone=True), nullable=True)  # Not set automatically; writers must update it alongside is_read
+    
+    # Metadata
+    data = Column(JSON, nullable=True)  # Additional notification data
+    priority = Column(String(20), default="normal")  # low, normal, high, urgent
     
     # Timestamps
     created_at = Column(DateTime(timezone=True), server_default=func.now())
-    
-    # Foreign keys
-    user_id = Column(Integer, ForeignKey("users.id"), nullable=False)
-    document_id = Column(Integer, ForeignKey("documents.id"), nullable=True)
+    expires_at = Column(DateTime(timezone=True), nullable=True)  # Stored only; nothing in this model enforces expiry
     
     # Relationships
-    user = relationship("User")
-    document = relationship("Document")
+    user = relationship("User", back_populates="notifications")
+    
+    # Indexes
+    __table_args__ = (
+        Index('idx_notifications_notification_id', 'notification_id'),  # NOTE(review): column is already unique=True, index=True
+        Index('idx_notifications_user_id', 'user_id'),
+        Index('idx_notifications_is_read', 'is_read'),
+        Index('idx_notifications_notification_type', 'notification_type'),
+        Index('idx_notifications_priority', 'priority'),
+        Index('idx_notifications_created_at', 'created_at'),
+    )
 
 
 class APILog(Base):
-    """API log model for tracking API usage and performance"""
+    """One logged API request: method, endpoint, status code, caller details, timing/size figures and any error."""
     __tablename__ = "api_logs"
     
     id = Column(Integer, primary_key=True, index=True)
+    log_id = Column(UUID(as_uuid=True), default=uuid.uuid4, unique=True, index=True)
     
     # Request details
-    method = Column(String(10), nullable=False)
-    endpoint = Column(String(255), nullable=False)
+    method = Column(String(10), nullable=False)  # GET, POST, PUT, DELETE, etc.
+    endpoint = Column(String(500), nullable=False)
     status_code = Column(Integer, nullable=False)
     
+    # User and session
+    user_id = Column(Integer, ForeignKey('users.id'), nullable=True)  # FK only; this model declares no relationship to User
+    ip_address = Column(String(45), nullable=True)
+    user_agent = Column(Text, nullable=True)
+    
     # Performance
     response_time = Column(Float, nullable=True)  # Response time in seconds
     request_size = Column(Integer, nullable=True)  # Request size in bytes
     response_size = Column(Integer, nullable=True)  # Response size in bytes
     
-    # Request data
-    ip_address = Column(String(45), nullable=True)
-    user_agent = Column(Text, nullable=True)
-    request_headers = Column(JSON, nullable=True)
-    request_body = Column(Text, nullable=True)
-    response_body = Column(Text, nullable=True)
+    # Error handling
+    error_message = Column(Text, nullable=True)
     
     # Timestamps
     timestamp = Column(DateTime(timezone=True), server_default=func.now())
     
-    # Foreign keys
-    user_id = Column(Integer, ForeignKey("users.id"), nullable=True)
-    
-    # Relationships
-    user = relationship("User")
+    # Indexes
+    __table_args__ = (
+        Index('idx_api_logs_log_id', 'log_id'),  # NOTE(review): column is already unique=True, index=True
+        Index('idx_api_logs_user_id', 'user_id'),
+        Index('idx_api_logs_method', 'method'),
+        Index('idx_api_logs_endpoint', 'endpoint'),
+        Index('idx_api_logs_status_code', 'status_code'),
+        Index('idx_api_logs_timestamp', 'timestamp'),
+        Index('idx_api_logs_ip_address', 'ip_address'),
+    )
 
 
-class SystemConfiguration(Base):
-    """System configuration model for storing application settings"""
-    __tablename__ = "system_configurations"
+class SystemConfig(Base):
+    """Key/value configuration entry with optional description and category, plus encrypted/sensitive flags."""
+    __tablename__ = "system_configs"
     
     id = Column(Integer, primary_key=True, index=True)
-    config_key = Column(String(100), unique=True, index=True, nullable=False)
-    
-    # Configuration details
-    config_value = Column(Text, nullable=False)
-    config_type = Column(String(50), nullable=False)  # string, integer, float, boolean, json
+    key = Column(String(100), unique=True, nullable=False)
+    value = Column(Text, nullable=False)  # Always stored as text
     description = Column(Text, nullable=True)
+    category = Column(String(50), nullable=True)  # security, performance, monitoring, etc.
     
     # Metadata
-    is_active = Column(Boolean, default=True)
+    is_encrypted = Column(Boolean, default=False)  # Flag only; this model performs no encryption itself
+    is_sensitive = Column(Boolean, default=False)
+    
+    # Timestamps
     created_at = Column(DateTime(timezone=True), server_default=func.now())
     updated_at = Column(DateTime(timezone=True), onupdate=func.now())
     
-    # Foreign keys
-    updated_by = Column(Integer, ForeignKey("users.id"), nullable=True)
-    
-    # Relationships
-    updater = relationship("User")
+    # Indexes
+    __table_args__ = (
+        Index('idx_system_configs_key', 'key'),  # NOTE(review): key is already unique=True, so this may be redundant
+        Index('idx_system_configs_category', 'category'),
+        Index('idx_system_configs_created_at', 'created_at'),
+    )
diff --git a/backend/app/main.py b/backend/app/main.py
index b92db74..a11e7a1 100644
--- a/backend/app/main.py
+++ b/backend/app/main.py
@@ -1,269 +1,358 @@
-import asyncio
 import os
+import logging
+from datetime import datetime, timezone
 from contextlib import asynccontextmanager
-from typing import Dict, Any
-
-from fastapi import FastAPI, HTTPException, Depends, BackgroundTasks, UploadFile, File
+from fastapi import FastAPI, HTTPException, Request
+from fastapi.encoders import jsonable_encoder
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.middleware.trustedhost import TrustedHostMiddleware
-from fastapi.responses import StreamingResponse, JSONResponse
-from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
-import uvicorn
-from sse_starlette.sse import EventSourceResponse
+from fastapi.responses import JSONResponse
+from fastapi.exceptions import RequestValidationError
+from starlette.exceptions import HTTPException as StarletteHTTPException
 
 from .core.config import settings
-from .core.security import get_current_user, create_access_token
 from .core.middleware import setup_middleware
+from .core.monitoring import setup_monitoring, instrument_fastapi
 from .database.connection import init_database, check_database_connection
-from .api.v1.endpoints import auth, agentic, documents, traces, qa, compare, audit, settings, memory, summarizer, translator, sentiment, agents
 from .services.agent_service import AgentService
-from .services.memory_service import MemoryService
-from .core.monitoring import setup_monitoring, instrument_fastapi
 
-# Global agent service instance
+# Configure logging (the level name is upper-cased so "info" and "INFO" both work)
+logging.basicConfig(
+    level=getattr(logging, settings.LOG_LEVEL.upper()),
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+# Global service instances (agent_service is assigned by lifespan() at startup)
 agent_service = None
 
+
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     """Application lifespan manager"""
     global agent_service
     
     # Startup
-    print("🚀 Starting Smart Document Bot...")
-    
-    # Check database connection
-    if check_database_connection():
-        print("✅ Database connection verified")
-        # Initialize database tables
-        init_database()
-        print("✅ Database initialized")
-    else:
-        print("⚠️ Database connection failed - using fallback mode")
+    logger.info("Starting AI Document Agent application...")
     
-    # Initialize agent service
-    agent_service = AgentService()
-    print("✅ Agent service initialized")
-    
-    # Setup monitoring
-    setup_monitoring()
-    print("✅ Monitoring setup complete")
+    try:
+        # Initialize database
+        logger.info("Initializing database...")
+        await init_database()
+        
+        # Check database connection (NOTE(review): the result is ignored; only an exception aborts startup)
+        logger.info("Checking database connection...")
+        await check_database_connection()
+        
+        # Initialize agent service and publish it through the module-level global
+        logger.info("Initializing agent service...")
+        agent_service = AgentService()
+        await agent_service.initialize()
+        
+        # Setup monitoring (optional, controlled by settings.ENABLE_MONITORING)
+        if settings.ENABLE_MONITORING:
+            logger.info("Setting up monitoring...")
+            setup_monitoring()
+        
+        logger.info("Application startup completed successfully")
+        
+    except Exception as e:
+        logger.error(f"Application startup failed: {e}", exc_info=True)  # keep the traceback
+        raise
     
     yield
     
     # Shutdown
-    print("🛑 Shutting down Smart Document Bot...")
-    if agent_service:
-        await agent_service.cleanup_old_processing_history()
-    print("✅ Cleanup complete")
+    logger.info("Shutting down AI Document Agent application...")
+    
+    try:
+        if agent_service:
+            await agent_service.cleanup()
+        logger.info("Application shutdown completed successfully")
+    except Exception as e:  # best-effort cleanup: log with traceback, never block shutdown
+        logger.error(f"Application shutdown error: {e}", exc_info=True)
+
 
-# Create FastAPI app
+# Create FastAPI application (interactive docs are served only when settings.DEBUG is true)
 app = FastAPI(
-    title="Smart Document Bot API",
-    description="AI-powered document processing and analysis system",
-    version="1.0.0",
+    title=settings.APP_NAME,
+    version=settings.APP_VERSION,
+    description="Enterprise-Grade AI Document Processing & Analysis Platform",
+    docs_url="/docs" if settings.DEBUG else None,
+    redoc_url="/redoc" if settings.DEBUG else None,  # NOTE(review): openapi_url is not set, so /openapi.json stays served when DEBUG is off
     lifespan=lifespan
 )
 
-# Security
-security = HTTPBearer()
-
-# Add CORS middleware
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=settings.ALLOWED_ORIGINS,
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-
-# Setup custom middleware
+# Setup middleware (delegated to core.middleware.setup_middleware)
 setup_middleware(app)
 
-# Instrument FastAPI for monitoring
-instrument_fastapi(app)
+# Setup monitoring: instrument the app only when monitoring is enabled
+if settings.ENABLE_MONITORING:
+    instrument_fastapi(app)
 
-# Dependency to get agent service
-def get_agent_service() -> AgentService:
-    """Get the global agent service instance"""
-    if agent_service is None:
-        raise RuntimeError("Agent service not initialized")
-    return agent_service
 
-# Override dependency injection for endpoints
-app.dependency_overrides[AgentService] = get_agent_service
+# Global exception handlers
+@app.exception_handler(StarletteHTTPException)
+async def http_exception_handler(request: Request, exc: StarletteHTTPException):
+    """Render an HTTPException as JSON, forwarding any headers attached to it."""
+    logger.error(f"HTTP Exception: {exc.status_code} - {exc.detail}")
+    return JSONResponse(
+        status_code=exc.status_code,
+        headers=getattr(exc, "headers", None),  # keep e.g. WWW-Authenticate on 401
+        content={
+            "error": exc.detail, "status_code": exc.status_code,
+            "path": request.url.path
+        }
+    )
 
-# Include routers
-app.include_router(
-    auth.router,
-    prefix="/api/v1/auth",
-    tags=["Authentication"]
-)
 
-app.include_router(
-    agentic.router,
-    prefix="/api/v1/agentic",
-    tags=["Agentic Processing"]
-)
+@app.exception_handler(RequestValidationError)
+async def validation_exception_handler(request: Request, exc: RequestValidationError):
+    """Return request-validation failures as a 422 JSON body."""
+    logger.error(f"Validation Error: {exc.errors()}")
+    # exc.errors() may contain values json.dumps rejects (bytes input, exception
+    # objects under "ctx"); jsonable_encoder converts them to JSON-safe types.
+    details = jsonable_encoder(exc.errors())
+    return JSONResponse(
+        status_code=422,
+        content={"error": "Validation error", "details": details,
+                 "path": request.url.path}
+    )
 
-app.include_router(
-    documents.router,
-    prefix="/api/v1/documents",
-    tags=["Documents"],
-    dependencies=[Depends(get_agent_service)]
-)
 
-app.include_router(
-    traces.router,
-    prefix="/api/v1/traces",
-    tags=["Agent Traces"],
-    dependencies=[Depends(get_agent_service)]
-)
+@app.exception_handler(Exception)
+async def general_exception_handler(request: Request, exc: Exception):
+    """Catch-all handler: log the failure with its traceback, answer with a generic 500.
+
+    The exception text goes to the log only; the response body is fixed.
+    """
+    logger.error(f"Unhandled Exception: {str(exc)}", exc_info=True)
+    # Same keys and order as before: error, message, path.
+    payload = dict(error="Internal server error")
+    payload["message"] = "An unexpected error occurred"
+    payload["path"] = request.url.path
+    return JSONResponse(status_code=500, content=payload)
 
-app.include_router(
-    qa.router,
-    prefix="/api/v1/qa",
-    tags=["Question Answering"],
-    dependencies=[Depends(get_agent_service)]
-)
 
-app.include_router(
-    compare.router,
-    prefix="/api/v1/compare",
-    tags=["Document Comparison"],
-    dependencies=[Depends(get_agent_service)]
-)
+# Health check endpoints
+@app.get("/health")
+async def health_check():
+    """Basic health check: summary stamped with the current UTC time; 503 if a probe raises."""
+    try:
+        # Check database connection
+        await check_database_connection()
+        
+        # Check agent service
+        if agent_service:
+            service_status = await agent_service.get_status()
+        else:
+            service_status = "not_initialized"
+        
+        return {
+            "status": "healthy",
+            "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
+            "version": settings.APP_VERSION,
+            "database": "connected",
+            "agent_service": service_status
+        }
+    except Exception as e:
+        logger.error(f"Health check failed: {e}")
+        raise HTTPException(status_code=503, detail="Service unhealthy")
 
-app.include_router(
-    audit.router,
-    prefix="/api/v1/audit",
-    tags=["Audit Trail"],
-    dependencies=[Depends(get_agent_service)]
-)
 
-app.include_router(
-    settings.router,
-    prefix="/api/v1/settings",
-    tags=["Settings"]
-)
+@app.get("/health/detailed")
+async def detailed_health_check():
+    """Per-component health (database, agent service, Redis); any failing probe marks it "degraded"."""
+    try:
+        health_status = {
+            "status": "healthy",
+            "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
+            "version": settings.APP_VERSION,
+            "components": {}
+        }
+        
+        # Database health
+        try:
+            await check_database_connection()
+            health_status["components"]["database"] = {
+                "status": "healthy",
+                "message": "Database connection successful"
+            }
+        except Exception as e:
+            health_status["components"]["database"] = {
+                "status": "unhealthy",
+                "message": f"Database connection failed: {str(e)}"
+            }
+            health_status["status"] = "degraded"
+        
+        # Agent service health (a missing service is reported but does not degrade the status)
+        if agent_service:
+            try:
+                service_status = await agent_service.get_status()
+                health_status["components"]["agent_service"] = {
+                    "status": "healthy",
+                    "message": "Agent service operational",
+                    "details": service_status
+                }
+            except Exception as e:
+                health_status["components"]["agent_service"] = {
+                    "status": "unhealthy",
+                    "message": f"Agent service failed: {str(e)}"
+                }
+                health_status["status"] = "degraded"
+        else:
+            health_status["components"]["agent_service"] = {
+                "status": "not_initialized",
+                "message": "Agent service not initialized"
+            }
+        
+        # Redis health (a new client per probe, so it must be closed again)
+        try:
+            import redis
+            redis_client = redis.Redis.from_url(settings.REDIS_URL, decode_responses=True)
+            with redis_client:  # closes the client even when ping() raises
+                redis_client.ping()  # NOTE(review): synchronous call inside an async handler
+            health_status["components"]["redis"] = {
+                "status": "healthy",
+                "message": "Redis connection successful"
+            }
+        except Exception as e:
+            health_status["components"]["redis"] = {
+                "status": "unhealthy",
+                "message": f"Redis connection failed: {str(e)}"
+            }
+            health_status["status"] = "degraded"
+        
+        return health_status
+    except Exception as e:
+        logger.error(f"Detailed health check failed: {e}")
+        raise HTTPException(status_code=503, detail="Service unhealthy")
 
-app.include_router(
-    memory.router,
-    prefix="/api/v1/memory",
-    tags=["Memory"]
-)
 
-app.include_router(
-    summarizer.router,
-    prefix="/api/v1/summarizer",
-    tags=["Document Summarization"],
-    dependencies=[Depends(get_agent_service)]
-)
+# Root endpoint
+@app.get("/")
+async def root():
+    """Root endpoint: API name, version, entry points and feature list."""
+    return {
+        "name": settings.APP_NAME,
+        "version": settings.APP_VERSION,
+        "description": "Enterprise-Grade AI Document Processing & Analysis Platform",
+        "status": "operational",
+        "endpoints": {
+            "docs": "/docs" if settings.DEBUG else None,  # docs_url is disabled outside DEBUG
+            "health": "/health",
+            "api": "/api/v1"
+        },
+        "features": [
+            "Multi-Agent AI Processing",
+            "Document Intelligence",
+            "Enterprise Security",
+            "Real-time Analytics",
+            "Compliance Monitoring"
+        ]
+    }
 
-app.include_router(
-    translator.router,
-    prefix="/api/v1/translator",
-    tags=["Document Translation"],
-    dependencies=[Depends(get_agent_service)]
-)
 
-app.include_router(
-    sentiment.router,
-    prefix="/api/v1/sentiment",
-    tags=["Sentiment Analysis"],
-    dependencies=[Depends(get_agent_service)]
-)
+# API status endpoint
+@app.get("/api/v1/status")
+async def api_status():
+    """Report the API version, the current UTC time and the top-level route prefixes."""
+    return {
+        "api_version": "v1",
+        "status": "operational",
+        "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
+        "endpoints": {
+            "auth": "/api/v1/auth",
+            "documents": "/api/v1/documents",
+            "agents": "/api/v1/agents",
+            "analytics": "/api/v1/analytics"  # NOTE(review): no analytics router is registered in this module
+        }
+    }
+
 
-app.include_router(
-    agents.router,
-    prefix="/api/v1/agents",
-    tags=["Agent Management"],
-    dependencies=[Depends(get_agent_service)]
+# Include API routers ("settings" is aliased so it cannot shadow core.config.settings)
+from .api.v1.endpoints import (
+    auth, agentic, documents, traces, qa, compare,
+    audit, settings as settings_endpoints, memory, summarizer, translator,
+    sentiment, agents
 )
 
-# Health check endpoint
-@app.get("/health")
-async def health_check():
-    """Health check endpoint"""
-    return {
-        "status": "healthy",
-        "version": "1.0.0",
-        "timestamp": "2024-01-01T00:00:00Z"
-    }
+app.include_router(auth.router, prefix="/api/v1/auth", tags=["Authentication"])
+app.include_router(agentic.router, prefix="/api/v1/agentic", tags=["Agentic Processing"])
+app.include_router(documents.router, prefix="/api/v1/documents", tags=["Documents"])
+app.include_router(traces.router, prefix="/api/v1/traces", tags=["Agent Traces"])
+app.include_router(qa.router, prefix="/api/v1/qa", tags=["Question Answering"])
+app.include_router(compare.router, prefix="/api/v1/compare", tags=["Document Comparison"])
+app.include_router(audit.router, prefix="/api/v1/audit", tags=["Audit Trail"])
+app.include_router(settings_endpoints.router, prefix="/api/v1/settings", tags=["Settings"])
+app.include_router(memory.router, prefix="/api/v1/memory", tags=["Memory Management"])
+app.include_router(summarizer.router, prefix="/api/v1/summarizer", tags=["Document Summarization"])
+app.include_router(translator.router, prefix="/api/v1/translator", tags=["Document Translation"])
+app.include_router(sentiment.router, prefix="/api/v1/sentiment", tags=["Sentiment Analysis"])
+app.include_router(agents.router, prefix="/api/v1/agents", tags=["Agent Management"])
 
-# Root endpoint
-@app.get("/")
-async def root():
-    """Root endpoint"""
-    return {
-        "message": "Smart Document Bot API",
-        "version": "1.0.0",
-        "docs": "/docs",
-        "health": "/health"
-    }
 
 # Agent capabilities endpoint
 @app.get("/api/v1/agents/capabilities")
-async def get_agent_capabilities(agent_service: AgentService = Depends(get_agent_service)):
-    """Get information about available agents and their capabilities"""
+async def get_agent_capabilities():
+    """Return a static catalogue of the agents and the capabilities each one advertises."""
     return {
-        "agents": [
-            {
-                "name": "OrchestratorAgent",
-                "description": "Coordinates the overall document processing workflow",
-                "capabilities": ["workflow_coordination", "task_scheduling", "error_handling"]
+        "agents": {
+            "orchestrator": {
+                "description": "Workflow orchestration and coordination",
+                "capabilities": ["workflow_planning", "execution_monitoring", "resource_allocation"]
             },
-            {
-                "name": "IngestionAgent", 
-                "description": "Handles document upload and initial processing",
-                "capabilities": ["file_upload", "format_detection", "preprocessing"]
+            "ingestion": {
+                "description": "Document ingestion and content extraction",
+                "capabilities": ["text_extraction", "metadata_extraction", "format_detection"]
             },
-            {
-                "name": "ClassifierAgent",
-                "description": "Categorizes documents by type and content",
-                "capabilities": ["document_classification", "content_analysis", "metadata_extraction"]
+            "classifier": {
+                "description": "Document classification and categorization",
+                "capabilities": ["document_classification", "domain_detection", "content_categorization"]
             },
-            {
-                "name": "EntityAgent",
-                "description": "Extracts key entities and information",
-                "capabilities": ["entity_extraction", "named_entity_recognition", "relationship_mapping"]
+            "entity": {
+                "description": "Named entity recognition and extraction",
+                "capabilities": ["entity_extraction", "relationship_mapping", "entity_linking"]
             },
-            {
-                "name": "RiskAgent",
-                "description": "Assesses compliance risks and issues",
-                "capabilities": ["risk_assessment", "compliance_checking", "vulnerability_detection"]
+            "risk": {
+                "description": "Risk assessment and compliance monitoring",
+                "capabilities": ["risk_assessment", "compliance_checking", "policy_enforcement"]
             },
-            {
-                "name": "QAAgent",
-                "description": "Provides intelligent Q&A capabilities",
-                "capabilities": ["question_answering", "context_understanding", "knowledge_retrieval"]
+            "qa": {
+                "description": "Question answering and document querying",
+                "capabilities": ["question_answering", "context_retrieval", "answer_generation"]
             },
-            {
-                "name": "CompareAgent",
-                "description": "Compares documents for similarities and differences",
-                "capabilities": ["document_comparison", "similarity_analysis", "difference_detection"]
+            "compare": {
+                "description": "Document comparison and diff analysis",
+                "capabilities": ["document_comparison", "change_detection", "similarity_analysis"]
             },
-            {
-                "name": "AuditAgent",
-                "description": "Monitors and logs all system activities",
-                "capabilities": ["activity_logging", "audit_trail", "compliance_monitoring"]
+            "audit": {
+                "description": "Audit logging and compliance tracking",
+                "capabilities": ["audit_logging", "compliance_tracking", "event_monitoring"]
             },
-            {
-                "name": "SummarizerAgent",
-                "description": "Creates document summaries and insights",
-                "capabilities": ["document_summarization", "key_point_extraction", "insight_generation"]
+            "summarizer": {
+                "description": "Document summarization and key point extraction",
+                "capabilities": ["extractive_summarization", "abstractive_summarization", "key_point_extraction"]
             },
-            {
-                "name": "TranslatorAgent",
-                "description": "Handles multi-language document processing",
-                "capabilities": ["language_translation", "multilingual_processing", "cultural_adaptation"]
+            "translator": {
+                "description": "Multi-language document translation",
+                "capabilities": ["language_detection", "document_translation", "quality_assessment"]
             },
-            {
-                "name": "SentimentAnalysisAgent",
-                "description": "Analyzes document sentiment and tone",
+            "sentiment": {
+                "description": "Sentiment analysis and tone detection",
                 "capabilities": ["sentiment_analysis", "tone_detection", "emotion_recognition"]
             }
-        ]
+        },
+        "total_agents": 11,  # hard-coded; keep in sync with the "agents" mapping above
+        "total_capabilities": 33  # hard-coded; 11 agents x 3 capabilities each
     }
 
+
 if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=8000)
+    import uvicorn  # imported here so it is only needed when this file is run as a script
+    uvicorn.run(
+        "app.main:app",  # import string rather than the app object - NOTE(review): needed for reload; confirm
+        host=settings.HOST,
+        port=settings.PORT,
+        reload=settings.DEBUG,  # auto-reload follows the DEBUG flag
+        log_level=settings.LOG_LEVEL.lower()  # passed to uvicorn in lowercase
+    )
diff --git a/backend/app/risk/policies/compliance.rego b/backend/app/risk/policies/compliance.rego
new file mode 100644
index 0000000..80e6dbd
--- /dev/null
+++ b/backend/app/risk/policies/compliance.rego
@@ -0,0 +1,110 @@
+package smart_doc_bot.compliance
+
+# GDPR Compliance Policies (rules without a "default" are undefined, not false, when no body matches)
+gdpr_data_retention_allowed {  # personal data may be kept for at most 90 days
+    input.data_type == "personal_data"
+    input.retention_days <= 90
+}
+
+gdpr_data_processing_allowed {  # both conditions must hold
+    input.purpose == "legitimate_interest"
+    input.consent_given == true
+}
+
+gdpr_data_transfer_allowed {  # bodies sharing a rule name are alternatives (logical OR)
+    input.destination == "EU"
+}
+
+gdpr_data_transfer_allowed {  # US transfers additionally need adequacy_decision
+    input.destination == "US"
+    input.adequacy_decision == true
+}
+
+# HIPAA Compliance Policies: healthcare_provider role with purpose treatment, payment or healthcare_operations
+hipaa_phi_access_allowed {
+    input.user_role == "healthcare_provider"
+    input.purpose == "treatment"
+}
+
+hipaa_phi_access_allowed {
+    input.user_role == "healthcare_provider"
+    input.purpose == "payment"
+}
+
+hipaa_phi_access_allowed {
+    input.user_role == "healthcare_provider"
+    input.purpose == "healthcare_operations"
+}
+
+hipaa_audit_required {  # true whenever the input reports PHI access
+    input.phi_accessed == true
+}
+
+# SOX Compliance Policies: each allowed role is tied to one specific purpose
+sox_financial_data_access_allowed {
+    input.user_role == "auditor"
+    input.purpose == "financial_audit"
+}
+
+sox_financial_data_access_allowed {
+    input.user_role == "finance_manager"
+    input.purpose == "financial_reporting"
+}
+
+sox_audit_trail_required {  # true whenever the input reports financial data access
+    input.financial_data_accessed == true
+}
+
+# Data Classification Policies
+data_classification_required {  # NOTE(review): despite the name, holds only when a non-empty level is already set
+    input.data_type == "sensitive"
+    input.classification_level != ""
+}
+
+data_encryption_required {  # NOTE(review): despite the name, holds only when encryption is already enabled
+    input.data_type == "sensitive"
+    input.encryption_enabled == true
+}
+
+# Access Control Policies: a document request passes with an explicit permission or ownership
+access_control_required {
+    input.resource_type == "document"
+    input.user_has_permission == true
+}
+
+access_control_required {
+    input.resource_type == "document"
+    input.user_owns_resource == true
+}
+
+# Audit Policies: read, write and delete on a "sensitive" resource type each require logging
+audit_logging_required {
+    input.action == "read"
+    input.resource_type == "sensitive"
+}
+
+audit_logging_required {
+    input.action == "write"
+    input.resource_type == "sensitive"
+}
+
+audit_logging_required {
+    input.action == "delete"
+    input.resource_type == "sensitive"
+}
+
+# Default deny for sensitive operations (the only rule in this package with an explicit default)
+default sensitive_operation_allowed = false
+
+sensitive_operation_allowed {  # export: admin role plus an audit record
+    input.operation == "data_export"
+    input.user_role == "admin"
+    input.audit_logged == true
+}
+
+sensitive_operation_allowed {  # deletion: as export, plus an explicit confirmation
+    input.operation == "data_deletion"
+    input.user_role == "admin"
+    input.confirmation_received == true
+    input.audit_logged == true
+}
diff --git a/backend/app/risk/policies/security.rego b/backend/app/risk/policies/security.rego
new file mode 100644
index 0000000..a2ec517
--- /dev/null
+++ b/backend/app/risk/policies/security.rego
@@ -0,0 +1,87 @@
+package smart_doc_bot.security
+
+# Default deny: a request is allowed only if one of the "allow" bodies below matches
+default allow = false
+
+# Public endpoints: matched on method and path only, no user check
+allow {
+    input.method == "GET"
+    input.path = ["api", "v1", "health"]
+}
+
+allow {
+    input.method == "POST"
+    input.path = ["api", "v1", "auth", "login"]
+}
+
+allow {
+    input.method == "POST"
+    input.path = ["api", "v1", "auth", "register"]
+}
+
+# Document access policies
+allow {
+    input.method == "GET"
+    input.path = ["api", "v1", "documents"]
+    has_permission(input.user, "documents:read")
+}
+
+allow {
+    input.method == "POST"
+    input.path = ["api", "v1", "documents", "upload"]
+    has_permission(input.user, "documents:create")
+}
+
+allow {  # the last segment is the document id: match any value, not the literal "id"
+    input.method == "DELETE"
+    input.path = ["api", "v1", "documents", _]
+    has_permission(input.user, "documents:delete")
+}
+
+# Agent access policies
+allow {
+    input.method == "POST"
+    input.path = ["api", "v1", "agents", "process"]
+    has_permission(input.user, "agents:execute")
+}
+
+allow {
+    input.method == "GET"
+    input.path = ["api", "v1", "agents", "traces"]
+    has_permission(input.user, "agents:read")
+}
+
+# Analytics access policies
+allow {
+    input.method == "GET"
+    input.path = ["api", "v1", "analytics"]
+    has_permission(input.user, "analytics:read")
+}
+
+# Admin access policies
+allow {
+    input.method == "GET"
+    input.path = ["api", "v1", "admin"]
+    is_admin(input.user)
+}
+
+# Helper functions: has_permission holds if ANY of its three bodies matches
+has_permission(user, permission) {  # admins implicitly hold every permission
+    user.roles[_] == "admin"
+}
+
+has_permission(user, permission) {  # exact permission string is listed on the user
+    user.permissions[_] == permission
+}
+
+has_permission(user, permission) {  # "*" on the user grants every permission
+    user.permissions[_] == "*"
+}
+
+is_admin(user) {  # admin by role ...
+    user.roles[_] == "admin"
+}
+
+is_admin(user) {  # ... or by the superuser flag
+    user.is_superuser == true
+}
diff --git a/backend/app/services/agent_service.py b/backend/app/services/agent_service.py
index 1562897..e064dce 100644
--- a/backend/app/services/agent_service.py
+++ b/backend/app/services/agent_service.py
@@ -1,7 +1,9 @@
 import asyncio
 import uuid
+import logging
 from typing import Any, Dict, List, Optional
 from datetime import datetime
+from contextlib import asynccontextmanager
 
 from ..agents.orchestrator import OrchestratorAgent
 from ..agents.ingestion import IngestionAgent
@@ -16,40 +18,112 @@
 from ..agents.sentiment import SentimentAnalysisAgent
 from ..models.base import Document, AgentResult, AgentType
 from ..core.config import settings
+from ..core.monitoring import get_monitor
+
+logger = logging.getLogger(__name__)
 
 
 class AgentService:
     """Service for managing agent execution and orchestration"""
     
     def __init__(self):
-        # Initialize the orchestrator agent
-        self.orchestrator = OrchestratorAgent(llm_model=settings.LLM_MODEL)
-        
-        # Initialize individual agents for direct access
-        self.ingestion_agent = IngestionAgent(llm_model=settings.LLM_MODEL)
-        self.classifier_agent = ClassifierAgent(llm_model=settings.LLM_MODEL)
-        self.entity_agent = EntityAgent(llm_model=settings.LLM_MODEL)
-        self.risk_agent = RiskAgent(llm_model=settings.LLM_MODEL)
-        self.qa_agent = QAAgent(llm_model=settings.LLM_MODEL)
-        self.compare_agent = CompareAgent(llm_model=settings.LLM_MODEL)
-        self.audit_agent = AuditAgent(llm_model=settings.LLM_MODEL)
-        self.summarizer_agent = SummarizerAgent(llm_model=settings.LLM_MODEL)
-        self.translator_agent = TranslatorAgent(llm_model=settings.LLM_MODEL)
-        self.sentiment_agent = SentimentAnalysisAgent(llm_model=settings.LLM_MODEL)
-        
-        # Processing history
+        self.monitor = get_monitor()  # monitoring handle obtained from core.monitoring
         self.processing_history = {}
+        self.agent_instances = {}  # NOTE(review): never populated in the code visible here
+        self.is_initialized = False  # flipped to True at the end of initialize()
+        
+        # Agent mapping for easy access; every slot stays None until initialize() fills it
+        self.agent_mapping = {
+            "orchestrator": None,
+            "ingestion": None,
+            "classifier": None,
+            "entity": None,
+            "risk": None,
+            "qa": None,
+            "compare": None,
+            "audit": None,
+            "summarizer": None,
+            "translator": None,
+            "sentiment": None
+        }
+    
+    async def initialize(self) -> None:
+        """Instantiate every agent and mark the service as ready.
+
+        The agents are constructed one after another in a fixed order, each
+        with ``llm_model=settings.OPENAI_MODEL``, and stored in
+        ``self.agent_mapping`` under their short name. Once all of them
+        exist, each is also published under its legacy attribute name
+        (``self.orchestrator``, ``self.ingestion_agent``, ...) and
+        ``self.is_initialized`` is set to ``True``.
+
+        Nothing in the body is awaited, so the call runs to completion
+        without yielding to the event loop.
+
+        Calling it again rebuilds every agent and overwrites the stored
+        references.
+
+        Returns:
+            None.
+
+        Raises:
+            Exception: Any error raised while building an agent is logged
+                and re-raised unchanged. Agents built before the failure
+                remain in ``agent_mapping``; ``is_initialized`` stays False.
+        """
+        try:
+            logger.info("Initializing AgentService...")
+
+            # (mapping key, agent class, legacy attribute), in build order.
+            agent_specs = (
+                ("orchestrator", OrchestratorAgent, "orchestrator"),
+                ("ingestion", IngestionAgent, "ingestion_agent"),
+                ("classifier", ClassifierAgent, "classifier_agent"),
+                ("entity", EntityAgent, "entity_agent"),
+                ("risk", RiskAgent, "risk_agent"),
+                ("qa", QAAgent, "qa_agent"),
+                ("compare", CompareAgent, "compare_agent"),
+                ("audit", AuditAgent, "audit_agent"),
+                ("summarizer", SummarizerAgent, "summarizer_agent"),
+                ("translator", TranslatorAgent, "translator_agent"),
+                ("sentiment", SentimentAnalysisAgent, "sentiment_agent"),
+            )
+
+            # Pass 1: construct each agent with the configured model.
+            for mapping_key, agent_cls, _ in agent_specs:
+                self.agent_mapping[mapping_key] = agent_cls(
+                    llm_model=settings.OPENAI_MODEL
+                )
+
+            # Pass 2: store references for backward compatibility. This
+            # runs only after every agent was built, so a constructor
+            # failure leaves the legacy attributes untouched.
+            for mapping_key, _, legacy_attr in agent_specs:
+                setattr(self, legacy_attr, self.agent_mapping[mapping_key])
+
+            # From here on process_document() accepts work.
+            self.is_initialized = True
+            logger.info("AgentService initialized successfully")
+
+        except Exception as e:
+            logger.error(f"Failed to initialize AgentService: {e}")
+            raise
     
     async def process_document(self, document: Document, goal: str = "Analyze document for compliance and risks") -> Dict[str, Any]:
         """Process a document through the complete agent pipeline"""
+        if not self.is_initialized:
+            raise RuntimeError("AgentService not initialized. Call initialize() first.")
+        
+        processing_id = str(uuid.uuid4())
+        start_time = datetime.utcnow()
+        
         try:
-            # Generate processing ID
-            processing_id = str(uuid.uuid4())
+            logger.info(f"Starting document processing: {processing_id}")
             
             # Initialize processing history
             self.processing_history[processing_id] = {
                 "processing_id": processing_id,
-                "start_time": datetime.utcnow().isoformat(),
+                "start_time": start_time.isoformat(),
                 "document_id": getattr(document, 'id', 'unknown'),
                 "goal": goal,
                 "stages": [],
@@ -60,79 +134,138 @@ async def process_document(self, document: Document, goal: str = "Analyze docume
             context = {
                 "document": document,
                 "processing_id": processing_id,
-                "goal": goal
+                "goal": goal,
+                "agent_service": self
             }
             
-            # Execute orchestration
-            orchestration_result = await self.orchestrator.run(goal, context)
+            # Execute orchestration with monitoring
+            with self.monitor.monitor_agent_execution("orchestrator", processing_id):
+                orchestration_result = await self.orchestrator.run(goal, context)
+            
+            # Calculate processing time
+            end_time = datetime.utcnow()
+            processing_duration = (end_time - start_time).total_seconds()
             
             # Update processing history
             self.processing_history[processing_id].update({
-                "end_time": datetime.utcnow().isoformat(),
+                "end_time": end_time.isoformat(),
+                "duration": processing_duration,
                 "status": "completed" if orchestration_result else "failed",
                 "orchestration_result": orchestration_result.dict() if orchestration_result else None,
                 "workflow_status": self.orchestrator.get_workflow_status()
             })
             
+            # Record metrics
+            self.monitor.record_agent_execution(
+                agent_name="orchestrator",
+                duration=processing_duration,
+                success=bool(orchestration_result),
+                confidence=orchestration_result.confidence if orchestration_result else 0.0
+            )
+            
+            logger.info(f"Document processing completed: {processing_id} in {processing_duration:.2f}s")
+            
             return {
                 "processing_id": processing_id,
                 "status": "completed" if orchestration_result else "failed",
                 "result": orchestration_result.output if orchestration_result else None,
                 "confidence": orchestration_result.confidence if orchestration_result else 0.0,
                 "rationale": orchestration_result.rationale if orchestration_result else "Processing failed",
-                "workflow_status": self.orchestrator.get_workflow_status()
+                "workflow_status": self.orchestrator.get_workflow_status(),
+                "duration": processing_duration
             }
             
         except Exception as e:
+            end_time = datetime.utcnow()
+            processing_duration = (end_time - start_time).total_seconds()
+            
+            logger.error(f"Document processing failed: {processing_id} - {e}")
+            
             # Update processing history with error
             if processing_id in self.processing_history:
                 self.processing_history[processing_id].update({
-                    "end_time": datetime.utcnow().isoformat(),
+                    "end_time": end_time.isoformat(),
+                    "duration": processing_duration,
                     "status": "failed",
                     "error": str(e)
                 })
             
+            # Record error metrics
+            self.monitor.record_agent_execution(
+                agent_name="orchestrator",
+                duration=processing_duration,
+                success=False,
+                error=str(e)
+            )
+            
             return {
-                "processing_id": processing_id if 'processing_id' in locals() else "unknown",
+                "processing_id": processing_id,
                 "status": "failed",
                 "error": str(e),
                 "result": None,
                 "confidence": 0.0,
-                "rationale": f"Processing failed: {str(e)}"
+                "rationale": f"Processing failed: {str(e)}",
+                "duration": processing_duration
             }
     
     async def execute_single_agent(self, agent_type: str, document: Document, goal: str) -> AgentResult:
-        """Execute a single agent"""
+        """Execute a single agent with monitoring and error handling"""
+        if not self.is_initialized:
+            raise RuntimeError("AgentService not initialized. Call initialize() first.")
+        
+        agent_type = agent_type.lower()
+        if agent_type not in self.agent_mapping:
+            raise ValueError(f"Unknown agent type: {agent_type}")
+        
+        agent = self.agent_mapping[agent_type]
+        if not agent:
+            raise RuntimeError(f"Agent {agent_type} not initialized")
+        
+        start_time = datetime.utcnow()
+        
         try:
-            # Map agent types to agent instances
-            agent_mapping = {
-                "ingestion": self.ingestion_agent,
-                "classifier": self.classifier_agent,
-                "entity": self.entity_agent,
-                "risk": self.risk_agent,
-                "qa": self.qa_agent,
-                "compare": self.compare_agent,
-                "audit": self.audit_agent,
-                "summarizer": self.summarizer_agent,
-                "translator": self.translator_agent,
-                "sentiment": self.sentiment_agent
-            }
-            
-            agent = agent_mapping.get(agent_type.lower())
-            if not agent:
-                raise ValueError(f"Unknown agent type: {agent_type}")
+            logger.info(f"Executing agent: {agent_type}")
             
             # Prepare context
             context = {
                 "document": document,
-                "goal": goal
+                "goal": goal,
+                "agent_service": self
             }
             
-            # Execute agent
-            result = await agent.run(goal, context)
+            # Execute agent with monitoring
+            with self.monitor.monitor_agent_execution(agent_type, f"{agent_type}_{document.id}"):
+                result = await agent.run(goal, context)
+            
+            # Calculate execution time
+            end_time = datetime.utcnow()
+            execution_duration = (end_time - start_time).total_seconds()
+            
+            # Record metrics
+            self.monitor.record_agent_execution(
+                agent_name=agent_type,
+                duration=execution_duration,
+                success=bool(result),
+                confidence=result.confidence if result else 0.0
+            )
+            
+            logger.info(f"Agent execution completed: {agent_type} in {execution_duration:.2f}s")
             return result
             
         except Exception as e:
+            end_time = datetime.utcnow()
+            execution_duration = (end_time - start_time).total_seconds()
+            
+            logger.error(f"Agent execution failed: {agent_type} - {e}")
+            
+            # Record error metrics
+            self.monitor.record_agent_execution(
+                agent_name=agent_type,
+                duration=execution_duration,
+                success=False,
+                error=str(e)
+            )
+            
             return AgentResult(
                 output=None,
                 rationale=f"Agent execution failed: {str(e)}",
@@ -142,88 +275,202 @@ async def execute_single_agent(self, agent_type: str, document: Document, goal:
     
     async def compare_documents(self, document_a: Document, document_b: Document, goal: str = "Compare documents for differences and risk changes") -> Dict[str, Any]:
         """Compare two documents using the compare agent"""
+        if not self.is_initialized:  # checked outside the try, so this propagates instead of becoming a "failed" dict
+            raise RuntimeError("AgentService not initialized. Call initialize() first.")
+        
+        start_time = datetime.utcnow()
+        
         try:
+            logger.info("Starting document comparison")
+            
             # Prepare context for comparison
             context = {
                 "document_a": document_a,
                 "document_b": document_b,
-                "goal": goal
+                "goal": goal,
+                "agent_service": self
             }
             
-            # Execute comparison
-            comparison_result = await self.compare_agent.run(goal, context)
+            # Run the compare agent inside the monitor's context manager, labelled with both document ids
+            with self.monitor.monitor_agent_execution("compare", f"compare_{document_a.id}_{document_b.id}"):
+                comparison_result = await self.compare_agent.run(goal, context)
+            
+            # Elapsed seconds since start_time, which was taken before the try block
+            end_time = datetime.utcnow()
+            execution_duration = (end_time - start_time).total_seconds()
+            
+            # Record metrics; a falsy result counts as a failure with confidence 0.0. NOTE(review): confirm the monitor context above does not already record this run
+            self.monitor.record_agent_execution(
+                agent_name="compare",
+                duration=execution_duration,
+                success=bool(comparison_result),
+                confidence=comparison_result.confidence if comparison_result else 0.0
+            )
+            
+            logger.info(f"Document comparison completed in {execution_duration:.2f}s")
             
             return {
                 "status": "completed" if comparison_result else "failed",
                 "result": comparison_result.output if comparison_result else None,
                 "confidence": comparison_result.confidence if comparison_result else 0.0,
-                "rationale": comparison_result.rationale if comparison_result else "Comparison failed"
+                "rationale": comparison_result.rationale if comparison_result else "Comparison failed",
+                "duration": execution_duration
             }
             
         except Exception as e:
+            end_time = datetime.utcnow()
+            execution_duration = (end_time - start_time).total_seconds()
+            
+            logger.error(f"Document comparison failed: {e}")
+            
+            # Record error metrics; the exception is reported in the returned dict, not re-raised
+            self.monitor.record_agent_execution(
+                agent_name="compare",
+                duration=execution_duration,
+                success=False,
+                error=str(e)
+            )
+            
             return {
                 "status": "failed",
                 "error": str(e),
                 "result": None,
                 "confidence": 0.0,
-                "rationale": f"Comparison failed: {str(e)}"
+                "rationale": f"Comparison failed: {str(e)}",
+                "duration": execution_duration
             }
     
     async def generate_audit_trail(self, document: Document, processing_history: List[Dict] = None) -> Dict[str, Any]:
         """Generate audit trail for a document"""
+        if not self.is_initialized:  # checked outside the try, so this propagates instead of becoming a "failed" dict
+            raise RuntimeError("AgentService not initialized. Call initialize() first.")
+        
+        start_time = datetime.utcnow()
+        
         try:
+            logger.info(f"Generating audit trail for document: {document.id}")
+            
             # Prepare context for audit
             context = {
                 "document": document,
                 "processing_history": processing_history or [],
-                "goal": "Generate comprehensive audit trail"
+                "goal": "Generate comprehensive audit trail",
+                "agent_service": self
             }
             
-            # Execute audit
-            audit_result = await self.audit_agent.run("Generate audit trail", context)
+            # Run the audit agent inside the monitor's context manager, labelled with the document id
+            with self.monitor.monitor_agent_execution("audit", f"audit_{document.id}"):
+                audit_result = await self.audit_agent.run("Generate audit trail", context)
+            
+            # Elapsed seconds since start_time, which was taken before the try block
+            end_time = datetime.utcnow()
+            execution_duration = (end_time - start_time).total_seconds()
+            
+            # Record metrics; a falsy result counts as a failure with confidence 0.0. NOTE(review): confirm the monitor context above does not already record this run
+            self.monitor.record_agent_execution(
+                agent_name="audit",
+                duration=execution_duration,
+                success=bool(audit_result),
+                confidence=audit_result.confidence if audit_result else 0.0
+            )
+            
+            logger.info(f"Audit trail generation completed in {execution_duration:.2f}s")
             
             return {
                 "status": "completed" if audit_result else "failed",
                 "result": audit_result.output if audit_result else None,
                 "confidence": audit_result.confidence if audit_result else 0.0,
-                "rationale": audit_result.rationale if audit_result else "Audit generation failed"
+                "rationale": audit_result.rationale if audit_result else "Audit generation failed",
+                "duration": execution_duration
             }
             
         except Exception as e:
+            end_time = datetime.utcnow()
+            execution_duration = (end_time - start_time).total_seconds()
+            
+            logger.error(f"Audit trail generation failed: {e}")
+            
+            # Record error metrics; the exception is reported in the returned dict, not re-raised
+            self.monitor.record_agent_execution(
+                agent_name="audit",
+                duration=execution_duration,
+                success=False,
+                error=str(e)
+            )
+            
             return {
                 "status": "failed",
                 "error": str(e),
                 "result": None,
                 "confidence": 0.0,
-                "rationale": f"Audit generation failed: {str(e)}"
+                "rationale": f"Audit generation failed: {str(e)}",
+                "duration": execution_duration
             }
     
     async def generate_qa(self, document: Document, goal: str = "Generate questions and answers about the document") -> Dict[str, Any]:
         """Generate questions and answers for a document"""
+        if not self.is_initialized:  # checked outside the try, so this propagates instead of becoming a "failed" dict
+            raise RuntimeError("AgentService not initialized. Call initialize() first.")
+        
+        start_time = datetime.utcnow()
+        
         try:
+            logger.info(f"Generating QA for document: {document.id}")
+            
             # Prepare context for QA generation
             context = {
                 "document": document,
-                "goal": goal
+                "goal": goal,
+                "agent_service": self
             }
             
-            # Execute QA generation
-            qa_result = await self.qa_agent.run(goal, context)
+            # Run the QA agent inside the monitor's context manager, labelled with the document id
+            with self.monitor.monitor_agent_execution("qa", f"qa_{document.id}"):
+                qa_result = await self.qa_agent.run(goal, context)
+            
+            # Elapsed seconds since start_time, which was taken before the try block
+            end_time = datetime.utcnow()
+            execution_duration = (end_time - start_time).total_seconds()
+            
+            # Record metrics; a falsy result counts as a failure with confidence 0.0. NOTE(review): confirm the monitor context above does not already record this run
+            self.monitor.record_agent_execution(
+                agent_name="qa",
+                duration=execution_duration,
+                success=bool(qa_result),
+                confidence=qa_result.confidence if qa_result else 0.0
+            )
+            
+            logger.info(f"QA generation completed in {execution_duration:.2f}s")
             
             return {
                 "status": "completed" if qa_result else "failed",
                 "result": qa_result.output if qa_result else None,
                 "confidence": qa_result.confidence if qa_result else 0.0,
-                "rationale": qa_result.rationale if qa_result else "QA generation failed"
+                "rationale": qa_result.rationale if qa_result else "QA generation failed",
+                "duration": execution_duration
             }
             
         except Exception as e:
+            end_time = datetime.utcnow()
+            execution_duration = (end_time - start_time).total_seconds()
+            
+            logger.error(f"QA generation failed: {e}")
+            
+            # Record error metrics; the exception is reported in the returned dict, not re-raised
+            self.monitor.record_agent_execution(
+                agent_name="qa",
+                duration=execution_duration,
+                success=False,
+                error=str(e)
+            )
+            
             return {
                 "status": "failed",
                 "error": str(e),
                 "result": None,
                 "confidence": 0.0,
-                "rationale": f"QA generation failed: {str(e)}"
+                "rationale": f"QA generation failed: {str(e)}",
+                "duration": execution_duration
             }
     
     def get_processing_status(self, processing_id: str) -> Optional[Dict[str, Any]]:
@@ -240,64 +487,90 @@ def get_agent_capabilities(self) -> Dict[str, Any]:
             "orchestrator": {
                 "name": "OrchestratorAgent",
                 "description": "Coordinates the complete document processing workflow",
-                "capabilities": ["Workflow planning", "Execution monitoring", "Stage coordination"]
+                "capabilities": ["Workflow planning", "Execution monitoring", "Stage coordination"],
+                "status": "initialized" if self.agent_mapping["orchestrator"] else "not_initialized"
             },
             "ingestion": {
                 "name": "IngestionAgent",
                 "description": "Extracts and normalizes text from documents",
-                "capabilities": ["OCR", "PDF parsing", "Text normalization"]
+                "capabilities": ["OCR", "PDF parsing", "Text normalization"],
+                "status": "initialized" if self.agent_mapping["ingestion"] else "not_initialized"
             },
             "classifier": {
                 "name": "ClassifierAgent",
                 "description": "Classifies documents and analyzes content structure",
-                "capabilities": ["Document classification", "Content analysis", "Domain identification"]
+                "capabilities": ["Document classification", "Content analysis", "Domain identification"],
+                "status": "initialized" if self.agent_mapping["classifier"] else "not_initialized"
             },
             "entity": {
                 "name": "EntityAgent",
                 "description": "Extracts named entities and key information",
-                "capabilities": ["Named entity recognition", "Clause extraction", "Key information extraction"]
+                "capabilities": ["Named entity recognition", "Clause extraction", "Key information extraction"],
+                "status": "initialized" if self.agent_mapping["entity"] else "not_initialized"
             },
             "risk": {
                 "name": "RiskAgent",
                 "description": "Assesses compliance, financial, and operational risks",
-                "capabilities": ["Compliance risk analysis", "Financial risk analysis", "Operational risk analysis"]
+                "capabilities": ["Compliance risk analysis", "Financial risk analysis", "Operational risk analysis"],
+                "status": "initialized" if self.agent_mapping["risk"] else "not_initialized"
             },
             "qa": {
                 "name": "QAAgent",
                 "description": "Generates questions and answers about documents",
-                "capabilities": ["Factual question generation", "Compliance question generation", "Risk question generation"]
+                "capabilities": ["Factual question generation", "Compliance question generation", "Risk question generation"],
+                "status": "initialized" if self.agent_mapping["qa"] else "not_initialized"
             },
             "compare": {
                 "name": "CompareAgent",
                 "description": "Compares documents for differences and changes",
-                "capabilities": ["Semantic comparison", "Structural comparison", "Compliance comparison"]
+                "capabilities": ["Semantic comparison", "Structural comparison", "Compliance comparison"],
+                "status": "initialized" if self.agent_mapping["compare"] else "not_initialized"
             },
             "audit": {
                 "name": "AuditAgent",
                 "description": "Generates audit trails and compliance reports",
-                "capabilities": ["Audit trail generation", "Compliance reporting", "Audit bundle creation"]
+                "capabilities": ["Audit trail generation", "Compliance reporting", "Audit bundle creation"],
+                "status": "initialized" if self.agent_mapping["audit"] else "not_initialized"
             },
             "summarizer": {
                 "name": "SummarizerAgent",
                 "description": "Generates comprehensive document summaries",
-                "capabilities": ["Extractive summarization", "Abstractive summarization", "Executive summaries", "Technical summaries", "Key points extraction"]
+                "capabilities": ["Extractive summarization", "Abstractive summarization", "Executive summaries", "Technical summaries", "Key points extraction"],
+                "status": "initialized" if self.agent_mapping["summarizer"] else "not_initialized"
             },
             "translator": {
                 "name": "TranslatorAgent",
                 "description": "Translates documents between multiple languages",
-                "capabilities": ["Text translation", "Document translation", "Language detection", "Technical translation", "Cultural adaptation"]
+                "capabilities": ["Text translation", "Document translation", "Language detection", "Technical translation", "Cultural adaptation"],
+                "status": "initialized" if self.agent_mapping["translator"] else "not_initialized"
             },
             "sentiment": {
                 "name": "SentimentAnalysisAgent",
                 "description": "Analyzes sentiment, tone, and emotional content",
-                "capabilities": ["Sentiment analysis", "Tone analysis", "Emotion detection", "Bias detection", "Sentiment tracking"]
+                "capabilities": ["Sentiment analysis", "Tone analysis", "Emotion detection", "Bias detection", "Sentiment tracking"],
+                "status": "initialized" if self.agent_mapping["sentiment"] else "not_initialized"
             }
         }
     
     def get_workflow_status(self) -> Dict[str, Any]:
         """Get current workflow status from orchestrator"""
+        if not self.orchestrator:  # no orchestrator reference set: return a placeholder instead of dereferencing it
+            return {"status": "not_initialized"}
         return self.orchestrator.get_workflow_status()
     
+    async def get_status(self) -> Dict[str, Any]:
+        """Return a snapshot of the service: init flag, agents, history size, workflow status."""
+        # The monitoring flags are constants here; they are not read from the monitor.
+        monitoring_flags = {"enabled": True, "metrics_available": True}
+        initialized = self.is_initialized
+        agent_info = self.get_agent_capabilities()
+        history_size = len(self.processing_history)
+        workflow = self.get_workflow_status()
+        status: Dict[str, Any] = {"initialized": initialized, "agents": agent_info}
+        status["processing_history_count"] = history_size
+        status.update({"workflow_status": workflow, "monitoring": monitoring_flags})
+        return status
+    
     async def cleanup_old_processing_history(self, max_age_hours: int = 24):
         """Clean up old processing history"""
         cutoff_time = datetime.utcnow().timestamp() - (max_age_hours * 3600)
@@ -310,3 +583,24 @@ async def cleanup_old_processing_history(self, max_age_hours: int = 24):
         
         for processing_id in to_remove:
             del self.processing_history[processing_id]
+        
+        logger.info(f"Cleaned up {len(to_remove)} old processing history entries")
+    
+    async def cleanup(self) -> None:
+        """Release agent instances and processing history, then mark the service uninitialized."""
+        try:
+            logger.info("Cleaning up AgentService...")
+            await self.cleanup_old_processing_history()
+            # Null the agents but keep the mapping keys: get_agent_capabilities() indexes
+            # agent_mapping[...] directly, so clear() would make get_status() raise KeyError.
+            for agent_key in self.agent_mapping:
+                self.agent_mapping[agent_key] = None
+                # Also drop the alias set by initialize() (self.orchestrator / self.<key>_agent).
+                alias = "orchestrator" if agent_key == "orchestrator" else f"{agent_key}_agent"
+                setattr(self, alias, None)
+            self.processing_history.clear()
+            self.is_initialized = False
+            logger.info("AgentService cleanup completed")
+        except Exception as e:
+            logger.error(f"AgentService cleanup failed: {e}")
+            raise
diff --git a/backend/app/services/memory_service.py b/backend/app/services/memory_service.py
index 44b70e3..404dcd3 100644
--- a/backend/app/services/memory_service.py
+++ b/backend/app/services/memory_service.py
@@ -1,5 +1,6 @@
 import json
 import asyncio
+import logging
 from typing import Dict, Any, List, Optional
 from datetime import datetime, timedelta
 import redis.asyncio as redis
@@ -9,32 +10,71 @@
 import uuid
 
 from ..core.config import settings
+from ..core.monitoring import get_monitor
+
+logger = logging.getLogger(__name__)
 
 
 class MemoryService:
     """Service for managing shared memory (Redis + Vector DB)"""
     
     def __init__(self):
+        self.monitor = get_monitor()
         self.redis_client = None
         self.vector_store = None
         self.chroma_client = None
-        self._initialize_connections()
+        self.is_initialized = False  # set to True only at the end of a successful initialize()
+        
+        # In-memory fallback for short-term data when no Redis client is set: key -> {"data", "expires_at"}
+        self._memory_storage = {}
     
-    def _initialize_connections(self):
+    async def initialize(self) -> None:
         """Initialize Redis and vector database connections"""
         try:
+            logger.info("Initializing MemoryService...")
+            
             # Initialize Redis
-            print("🔗 Initializing Redis connection...")
+            await self._initialize_redis()
+            
+            # Initialize ChromaDB (like the Redis helper, this one logs and handles its own errors)
+            await self._initialize_chromadb()
+            
+            self.is_initialized = True
+            logger.info("MemoryService initialized successfully")
+            
+        except Exception as e:
+            logger.error(f"Failed to initialize MemoryService: {e}")
+            # Fallback to in-memory storage. NOTE(review): the re-raise below leaves is_initialized False, so the guarded methods still refuse to run
+            self._setup_fallback_storage()
+            raise
+    
+    async def _initialize_redis(self) -> None:
+        """Connect to Redis when settings.REDIS_URL is set; on any failure, log it and leave redis_client as None."""
+        try:
             if settings.REDIS_URL:
-                self.redis_client = redis.from_url(settings.REDIS_URL)
+                logger.info("Initializing Redis connection...")
+                self.redis_client = redis.from_url(
+                    settings.REDIS_URL,
+                    decode_responses=True,
+                    max_connections=settings.REDIS_MAX_CONNECTIONS
+                )
+                
                 # Test connection
-                asyncio.create_task(self._test_redis_connection())
+                await self.redis_client.ping()
+                logger.info("Redis connection successful")
             else:
-                print("⚠️ Redis URL not configured, using in-memory storage")
+                logger.warning("Redis URL not configured, using in-memory storage")
                 self.redis_client = None
+                
+        except Exception as e:
+            logger.error(f"Redis initialization failed: {e}")
+            self.redis_client = None  # NOTE(review): a client created above is dropped here without being closed; confirm that is acceptable
+    
+    async def _initialize_chromadb(self) -> None:
+        """Initialize ChromaDB connection"""
+        try:
+            logger.info("Initializing ChromaDB connection...")
             
-            # Initialize ChromaDB
-            print("🔗 Initializing ChromaDB connection...")
             if settings.CHROMA_PERSIST_DIRECTORY:
                 self.chroma_client = chromadb.PersistentClient(
                     path=settings.CHROMA_PERSIST_DIRECTORY,
@@ -43,78 +83,104 @@ def _initialize_connections(self):
                         allow_reset=True
                     )
                 )
-                # Get or create default collection
-                try:
-                    self.vector_store = self.chroma_client.get_collection("documents")
-                except:
-                    self.vector_store = self.chroma_client.create_collection("documents")
             else:
-                print("⚠️ ChromaDB path not configured, using in-memory storage")
+                logger.warning("ChromaDB path not configured, using in-memory storage")
                 self.chroma_client = chromadb.Client()
-                self.vector_store = self.chroma_client.create_collection("documents")
             
-            print("✅ Memory connections initialized")
+            # Get or create default collection
+            try:
+                self.vector_store = self.chroma_client.get_collection(settings.CHROMA_COLLECTION_NAME)
+                logger.info(f"Using existing ChromaDB collection: {settings.CHROMA_COLLECTION_NAME}")
+            except Exception:
+                self.vector_store = self.chroma_client.create_collection(settings.CHROMA_COLLECTION_NAME)
+                logger.info(f"Created new ChromaDB collection: {settings.CHROMA_COLLECTION_NAME}")
             
         except Exception as e:
-            print(f"⚠️ Memory initialization failed: {e}")
-            # Fallback to in-memory storage
-            self.redis_client = None
-            self.chroma_client = chromadb.Client()
-            self.vector_store = self.chroma_client.create_collection("documents")
+            logger.error(f"ChromaDB initialization failed: {e}")
+            self._setup_fallback_storage()
     
-    async def _test_redis_connection(self):
-        """Test Redis connection"""
-        try:
-            await self.redis_client.ping()
-            print("✅ Redis connection successful")
-        except Exception as e:
-            print(f"❌ Redis connection failed: {e}")
-            self.redis_client = None
+    def _setup_fallback_storage(self) -> None:
+        """Point chroma_client/vector_store at a default chromadb.Client() and its "documents" collection."""
+        logger.info("Setting up fallback in-memory storage")
+        self.chroma_client = chromadb.Client()
+        self.vector_store = self.chroma_client.get_or_create_collection("documents")  # safe to call twice: create_collection raises if the name already exists
     
     async def store_short_term(self, key: str, data: Any, ttl: int = 3600) -> bool:
         """Store data in Redis (short-term memory)"""
+        if not self.is_initialized:  # checked outside the try, so this propagates instead of returning False
+            raise RuntimeError("MemoryService not initialized. Call initialize() first.")
+        
         try:
             if self.redis_client:
                 serialized_data = json.dumps(data, default=str)
                 await self.redis_client.setex(key, ttl, serialized_data)
-                print(f"📝 Stored in Redis: {key} (TTL: {ttl}s)")
+                logger.debug(f"Stored in Redis: {key} (TTL: {ttl}s)")
                 return True
             else:
                 # Fallback to in-memory storage
-                print(f"📝 Stored in memory: {key}")
+                self._memory_storage[key] = {
+                    "data": json.loads(json.dumps(data, default=str)),  # same JSON round-trip as the Redis path, so both backends return the same shape and never alias the caller's object
+                    "expires_at": datetime.utcnow() + timedelta(seconds=ttl)  # checked lazily by get_short_term
+                }
+                logger.debug(f"Stored in memory: {key}")
                 return True
             
         except Exception as e:
-            print(f"❌ Failed to store in short-term memory: {e}")
+            logger.error(f"Failed to store in short-term memory: {e}")
             return False
     
     async def get_short_term(self, key: str) -> Optional[Any]:
         """Retrieve data from Redis (short-term memory)"""
+        if not self.is_initialized:  # checked outside the try, so this propagates instead of returning None
+            raise RuntimeError("MemoryService not initialized. Call initialize() first.")
+        
         try:
             if self.redis_client:
                 data = await self.redis_client.get(key)
                 if data:
                     return json.loads(data)
+            else:
+                # No Redis client: serve the value from the in-memory fallback if it has not expired
+                if key in self._memory_storage:
+                    item = self._memory_storage[key]
+                    if datetime.utcnow() < item["expires_at"]:
+                        return item["data"]
+                    else:
+                        # Expired entries are evicted lazily here, then the lookup falls through to None
+                        del self._memory_storage[key]
+            
             return None
             
         except Exception as e:
-            print(f"❌ Failed to retrieve from short-term memory: {e}")
+            logger.error(f"Failed to retrieve from short-term memory: {e}")
             return None
     
     async def delete_short_term(self, key: str) -> bool:
         """Delete data from Redis (short-term memory)"""
+        if not self.is_initialized:  # checked outside the try, so this propagates instead of returning False
+            raise RuntimeError("MemoryService not initialized. Call initialize() first.")
+        
         try:
             if self.redis_client:
                 await self.redis_client.delete(key)
-                print(f"🗑️ Deleted from Redis: {key}")
+                logger.debug(f"Deleted from Redis: {key}")
+            else:
+                # No Redis client: remove from the in-memory fallback; a missing key still returns True
+                if key in self._memory_storage:
+                    del self._memory_storage[key]
+                    logger.debug(f"Deleted from memory: {key}")
+            
             return True
             
         except Exception as e:
-            print(f"❌ Failed to delete from short-term memory: {e}")
+            logger.error(f"Failed to delete from short-term memory: {e}")
             return False
     
     async def store_long_term(self, collection: str, documents: List[Dict[str, Any]], metadata: Dict[str, Any] = None) -> bool:
         """Store documents in vector database (long-term memory)"""
+        if not self.is_initialized:
+            raise RuntimeError("MemoryService not initialized. Call initialize() first.")
+        
         try:
             if not documents:
                 return False
@@ -138,7 +204,8 @@ async def store_long_term(self, collection: str, documents: List[Dict[str, Any]]
                 doc_metadata = {
                     "source": doc.get("source", "unknown"),
                     "type": doc.get("type", "document"),
-                    "created_at": datetime.now().isoformat(),
+                    "collection": collection,
+                    "created_at": datetime.utcnow().isoformat(),
                     **(metadata or {}),
                     **(doc.get("metadata", {}))
                 }
@@ -150,17 +217,28 @@ async def store_long_term(self, collection: str, documents: List[Dict[str, Any]]
                     documents=texts,
                     metadatas=metadatas
                 )
-                print(f"📚 Stored {len(ids)} documents in ChromaDB: {collection}")
+                logger.info(f"Stored {len(ids)} documents in ChromaDB collection: {collection}")
+                
+                # Record metrics
+                self.monitor.record_performance_metric(
+                    "documents_stored",
+                    len(ids),
+                    {"collection": collection}
+                )
+                
                 return True
             
             return False
             
         except Exception as e:
-            print(f"❌ Failed to store in long-term memory: {e}")
+            logger.error(f"Failed to store in long-term memory: {e}")
             return False
     
     async def search_long_term(self, query: str, collection: str = None, k: int = 5, filter_metadata: Dict[str, Any] = None) -> List[Dict[str, Any]]:
         """Search documents in vector database (long-term memory)"""
+        if not self.is_initialized:
+            raise RuntimeError("MemoryService not initialized. Call initialize() first.")
+        
         try:
             if not query.strip():
                 return []
@@ -183,11 +261,19 @@ async def search_long_term(self, query: str, collection: str = None, k: int = 5,
                         "id": results['ids'][0][i] if results['ids'] and results['ids'][0] else None
                     })
             
-            print(f"🔍 Searched ChromaDB for: '{query}' -> {len(formatted_results)} results")
+            logger.debug(f"Searched ChromaDB for: '{query}' -> {len(formatted_results)} results")
+            
+            # Record metrics
+            self.monitor.record_performance_metric(
+                "search_results",
+                len(formatted_results),
+                {"collection": collection or "default"}
+            )
+            
             return formatted_results
             
         except Exception as e:
-            print(f"❌ Failed to search long-term memory: {e}")
+            logger.error(f"Failed to search long-term memory: {e}")
             return []
     
     async def store_trace_context(self, trace_id: str, context: Dict[str, Any]) -> bool:
@@ -203,12 +289,12 @@ async def store_agent_memory(self, agent_id: str, memory_data: Dict[str, Any], m
         try:
             # Create memory document
             memory_doc = {
-                "id": f"memory_{agent_id}_{datetime.now().timestamp()}",
+                "id": f"memory_{agent_id}_{datetime.utcnow().timestamp()}",
                 "content": json.dumps(memory_data),
                 "metadata": {
                     "agent_id": agent_id,
                     "memory_type": memory_type,
-                    "timestamp": datetime.now().isoformat(),
+                    "timestamp": datetime.utcnow().isoformat(),
                     "data_hash": hashlib.md5(json.dumps(memory_data, sort_keys=True).encode()).hexdigest()
                 }
             }
@@ -216,7 +302,7 @@ async def store_agent_memory(self, agent_id: str, memory_data: Dict[str, Any], m
             return await self.store_long_term("agent_memories", [memory_doc])
             
         except Exception as e:
-            print(f"❌ Failed to store agent memory: {e}")
+            logger.error(f"Failed to store agent memory: {e}")
             return False
     
     async def search_agent_memory(self, agent_id: str, query: str, memory_type: str = None, k: int = 5) -> List[Dict[str, Any]]:
@@ -238,7 +324,7 @@ async def search_agent_memory(self, agent_id: str, query: str, memory_type: str
             return results
             
         except Exception as e:
-            print(f"❌ Failed to search agent memory: {e}")
+            logger.error(f"Failed to search agent memory: {e}")
             return []
     
     async def store_document_embeddings(self, document_id: str, text_chunks: List[str], metadata: Dict[str, Any] = None) -> bool:
@@ -260,7 +346,7 @@ async def store_document_embeddings(self, document_id: str, text_chunks: List[st
             return await self.store_long_term("document_embeddings", documents)
             
         except Exception as e:
-            print(f"❌ Failed to store document embeddings: {e}")
+            logger.error(f"Failed to store document embeddings: {e}")
             return False
     
     async def search_similar_documents(self, query: str, document_type: str = None, k: int = 5) -> List[Dict[str, Any]]:
@@ -273,60 +359,89 @@ async def search_similar_documents(self, query: str, document_type: str = None,
             return await self.search_long_term(query, "document_embeddings", k, filter_metadata)
             
         except Exception as e:
-            print(f"❌ Failed to search similar documents: {e}")
+            logger.error(f"Failed to search similar documents: {e}")
             return []
     
-    async def get_collection_stats(self, collection_name: str = "documents") -> Dict[str, Any]:
-        """Get statistics about a collection"""
+    async def get_collection_stats(self, collection_name: str = None) -> Dict[str, Any]:
+        """Get statistics about collections"""
+        # Returns a dict with "collection_name", "document_count", "status"
+        # ("active" / "inactive" / "error") and an ISO-formatted "timestamp";
+        # the error case additionally carries "error". Exceptions raised inside
+        # the try block are logged and reported through the "error" status
+        # instead of propagating; only the initialization check raises.
+        # NOTE: collection_name is only echoed back in the result - the count
+        # always comes from self.vector_store.count(), whatever name is passed.
+        if not self.is_initialized:
+            raise RuntimeError("MemoryService not initialized. Call initialize() first.")
+        
         try:
+            # Fall back to the configured collection name when none is given
+            collection_name = collection_name or settings.CHROMA_COLLECTION_NAME
+            
             if self.vector_store:
                 count = self.vector_store.count()
                 return {
                     "collection_name": collection_name,
                     "document_count": count,
-                    "status": "active"
+                    "status": "active",
+                    "timestamp": datetime.utcnow().isoformat()
                 }
+            
             return {
                 "collection_name": collection_name,
                 "document_count": 0,
-                "status": "inactive"
+                "status": "inactive",
+                "timestamp": datetime.utcnow().isoformat()
             }
             
         except Exception as e:
-            print(f"❌ Failed to get collection stats: {e}")
+            logger.error(f"Failed to get collection stats: {e}")
             return {
                 "collection_name": collection_name,
                 "document_count": 0,
                 "status": "error",
-                "error": str(e)
+                "error": str(e),
+                "timestamp": datetime.utcnow().isoformat()
             }
     
     async def cleanup_expired_data(self, max_age_hours: int = 24) -> int:
         """Clean up expired data from memory"""
+        # Removes every entry of the in-process fallback dict whose
+        # "expires_at" lies in the past (compared against datetime.utcnow())
+        # and returns how many were removed; returns 0 if an exception is caught.
+        # NOTE: max_age_hours is accepted but never read by this method.
+        if not self.is_initialized:
+            raise RuntimeError("MemoryService not initialized. Call initialize() first.")
+        
         try:
             cleaned_count = 0
             
+            # Clean up in-memory storage
+            current_time = datetime.utcnow()
+            expired_keys = []
+            
+            # Collect first, delete afterwards: the dict must not change size
+            # while it is being iterated.
+            for key, item in self._memory_storage.items():
+                if current_time > item["expires_at"]:
+                    expired_keys.append(key)
+            
+            for key in expired_keys:
+                del self._memory_storage[key]
+                cleaned_count += 1
+            
             # Note: ChromaDB doesn't have built-in TTL, so we'd need to implement
             # custom cleanup logic based on metadata timestamps
             # For now, we'll just clean up Redis data
+            # NOTE(review): despite the line above, nothing in this method
+            # touches Redis; only self._memory_storage is purged.
             
-            if self.redis_client:
-                # This is a simplified cleanup - in production you'd want more sophisticated logic
-                print(f"🧹 Cleanup completed: {cleaned_count} items removed")
-            
+            logger.info(f"Cleanup completed: {cleaned_count} items removed")
             return cleaned_count
             
         except Exception as e:
-            print(f"❌ Failed to cleanup expired data: {e}")
+            logger.error(f"Failed to cleanup expired data: {e}")
             return 0
     
     async def health_check(self) -> Dict[str, Any]:
         """Check health of memory services"""
+        if not self.is_initialized:
+            return {
+                "redis": "not_initialized",
+                "chromadb": "not_initialized",
+                "overall": "not_initialized"
+            }
+        
         try:
             health_status = {
                 "redis": "unknown",
                 "chromadb": "unknown",
-                "overall": "unknown"
+                "overall": "unknown",
+                "timestamp": datetime.utcnow().isoformat()
             }
             
             # Check Redis
@@ -334,7 +449,8 @@ async def health_check(self) -> Dict[str, Any]:
                 try:
                     await self.redis_client.ping()
                     health_status["redis"] = "healthy"
-                except:
+                except Exception as e:
+                    logger.error(f"Redis health check failed: {e}")
                     health_status["redis"] = "unhealthy"
             else:
                 health_status["redis"] = "not_configured"
@@ -344,7 +460,8 @@ async def health_check(self) -> Dict[str, Any]:
                 try:
                     self.vector_store.count()
                     health_status["chromadb"] = "healthy"
-                except:
+                except Exception as e:
+                    logger.error(f"ChromaDB health check failed: {e}")
                     health_status["chromadb"] = "unhealthy"
             else:
                 health_status["chromadb"] = "not_configured"
@@ -360,9 +477,46 @@ async def health_check(self) -> Dict[str, Any]:
             return health_status
             
         except Exception as e:
+            logger.error(f"Memory service health check failed: {e}")
             return {
                 "redis": "error",
                 "chromadb": "error", 
                 "overall": "error",
-                "error": str(e)
+                "error": str(e),
+                "timestamp": datetime.utcnow().isoformat()
             }
+    
+    async def get_status(self) -> Dict[str, Any]:
+        """Get comprehensive service status
+
+        Safe to call on a service that has not been initialized (or has been
+        cleaned up): in that state "collections" is reported as
+        {"status": "not_initialized"} instead of raising.
+
+        Returns:
+            Dict with "initialized", "redis_configured", "chromadb_configured",
+            "health" (result of health_check()), "collections" (result of
+            get_collection_stats()) and "memory_storage_size".
+        """
+        if self.is_initialized:
+            collections = await self.get_collection_stats()
+        else:
+            # get_collection_stats() raises RuntimeError on an uninitialized
+            # service, which would make this status report unusable in
+            # exactly the state it is meant to expose.
+            collections = {"status": "not_initialized"}
+
+        return {
+            "initialized": self.is_initialized,
+            "redis_configured": self.redis_client is not None,
+            "chromadb_configured": self.vector_store is not None,
+            "health": await self.health_check(),
+            "collections": collections,
+            "memory_storage_size": len(self._memory_storage)
+        }
+    
+    
+    async def cleanup(self) -> None:
+        """Cleanup resources and connections
+
+        Closes the Redis client (if any), clears the in-memory storage and
+        drops the Redis/ChromaDB references, leaving the service marked as
+        not initialized.
+
+        Raises:
+            Exception: whatever closing the Redis client raised; the state
+                reset below still runs in that case.
+        """
+        logger.info("Cleaning up MemoryService...")
+        try:
+            # Close Redis connection
+            if self.redis_client:
+                await self.redis_client.close()
+                logger.info("Redis connection closed")
+
+        except Exception as e:
+            logger.error(f"MemoryService cleanup failed: {e}")
+            raise
+
+        finally:
+            # Always reset state, even when close() failed, so the service is
+            # never left half torn down. The Redis reference is dropped too,
+            # so "redis_configured" no longer reports a closed client.
+            self.redis_client = None
+
+            # Clear in-memory storage
+            self._memory_storage.clear()
+
+            # Clear ChromaDB references
+            self.vector_store = None
+            self.chroma_client = None
+
+            self.is_initialized = False
+
+        logger.info("MemoryService cleanup completed")
diff --git a/backend/app/tasks/__init__.py b/backend/app/tasks/__init__.py
new file mode 100644
index 0000000..e3140cc
--- /dev/null
+++ b/backend/app/tasks/__init__.py
@@ -0,0 +1,40 @@
+"""
+Celery Tasks Package for AI Document Agent
+Handles distributed task processing for document analysis and AI operations
+"""
+
+from .document_tasks import *
+from .agent_tasks import *
+from .analytics_tasks import *
+from .maintenance_tasks import *
+
+# Public task names re-exported by the package, grouped by source module.
+__all__ = [
+    # Document tasks
+    "process_document",
+    "extract_text",
+    "classify_document",
+    "extract_entities",
+    "assess_risk",
+    "compare_documents",
+    # Defined in document_tasks alongside the tasks above
+    "process_pending_documents",
+    
+    # Agent tasks
+    "execute_agent",
+    "orchestrate_workflow",
+    "run_qa_agent",
+    "run_summarizer_agent",
+    "run_translator_agent",
+    "run_sentiment_agent",
+    
+    # Analytics tasks
+    "generate_daily_reports",
+    "update_system_metrics",
+    "analyze_performance",
+    "generate_insights",
+    
+    # Maintenance tasks
+    "cleanup_expired_documents",
+    "backup_database",
+    "cleanup_audit_logs",
+    "optimize_database",
+    "health_check",
+]
diff --git a/backend/app/tasks/document_tasks.py b/backend/app/tasks/document_tasks.py
new file mode 100644
index 0000000..b537df5
--- /dev/null
+++ b/backend/app/tasks/document_tasks.py
@@ -0,0 +1,282 @@
+"""
+Document Processing Tasks for AI Document Agent
+Handles document upload, processing, and analysis tasks
+"""
+
+import asyncio
+import json
+import logging
+import os
+import time
+from typing import Dict, Any, Optional
+
+from celery import current_task
+
+from ..core.celery_config import celery_app
+from ..services.agent_service import AgentService
+from ..services.memory_service import MemoryService
+from ..database.connection import get_db
+from ..database.models import Document, ProcessingHistory
+from ..core.config import settings
+
+logger = logging.getLogger(__name__)
+
+async def _store_document_vectors(
+    document_id: int,
+    user_id: int,
+    text: str,
+    classification: Dict[str, Any],
+    entities: Dict[str, Any],
+    risk_assessment: Dict[str, Any],
+) -> bool:
+    """Index one processed document in long-term (vector) memory.
+
+    Returns the boolean reported by MemoryService.store_long_term().
+    """
+    memory_service = MemoryService()
+    # store_long_term() raises RuntimeError on an uninitialized service.
+    # NOTE(review): assumes initialize() is a coroutine taking no arguments,
+    # like the other MemoryService methods - confirm.
+    await memory_service.initialize()
+    try:
+        return await memory_service.store_long_term(
+            settings.CHROMA_COLLECTION_NAME,
+            # store_long_term() takes a list of document dicts; the shape
+            # mirrors what MemoryService.store_agent_memory() passes.
+            [{"id": f"doc_{document_id}", "content": text}],
+            metadata={
+                "document_id": document_id,
+                "user_id": user_id,
+                # Nested results are JSON-encoded because vector-store
+                # metadata values are expected to be scalars.
+                "classification": json.dumps(classification, default=str),
+                "entities": json.dumps(entities, default=str),
+                "risk_assessment": json.dumps(risk_assessment, default=str),
+            },
+        )
+    finally:
+        await memory_service.cleanup()
+
+
+@celery_app.task(bind=True, max_retries=3)
+def process_document(self, document_id: int, user_id: int) -> Dict[str, Any]:
+    """
+    Process a document through the complete AI pipeline
+
+    Runs text extraction, classification, entity extraction and risk
+    assessment in sequence, indexes the text in the vector database, then
+    marks the document as processed and records a ProcessingHistory row.
+
+    Args:
+        document_id: ID of the document to process
+        user_id: ID of the user who uploaded the document
+
+    Returns:
+        Dictionary containing processing results ("document_id", "status",
+        "classification", "entities", "risk_assessment", "processing_time"
+        in seconds for this attempt)
+
+    Raises:
+        celery.exceptions.Retry: when a step fails and retries remain
+            (exponential backoff: 1s, 2s, 4s)
+        Exception: the original error once retries are exhausted
+    """
+    started_at = time.monotonic()
+    # Pre-declared so the error handler and `finally` can tell how far we got.
+    db = None
+    document = None
+
+    def report_progress(percent: int, status: str) -> None:
+        self.update_state(
+            state="PROGRESS",
+            meta={"current": percent, "total": 100, "status": status}
+        )
+
+    try:
+        report_progress(0, "Starting document processing")
+
+        # Get database session
+        db = next(get_db())
+
+        # Get document
+        document = db.query(Document).filter(Document.id == document_id).first()
+        if not document:
+            raise ValueError(f"Document {document_id} not found")
+
+        # The sub-steps are invoked in-process. Dispatching them with
+        # .delay() and blocking on .get() from inside a task is rejected by
+        # Celery workers ("Never call result.get() within a task!") and can
+        # deadlock the pool. When called directly, a step's self.retry()
+        # re-raises the original error, which is retried below as a whole.
+
+        # Step 1: Extract text (10%)
+        report_progress(10, "Extracting text")
+        extracted_text = extract_text(document_id)
+
+        # Step 2: Classify document (20%)
+        report_progress(20, "Classifying document")
+        classification = classify_document(document_id, extracted_text)
+
+        # Step 3: Extract entities (40%)
+        report_progress(40, "Extracting entities")
+        entities = extract_entities(document_id, extracted_text)
+
+        # Step 4: Assess risk (60%)
+        report_progress(60, "Assessing risk")
+        risk_assessment = assess_risk(document_id, extracted_text, entities)
+
+        # Step 5: Store in vector database (80%)
+        report_progress(80, "Storing in vector database")
+        # This function is synchronous, so the coroutine is driven with
+        # asyncio.run(); `await` here would be a SyntaxError.
+        stored = asyncio.run(
+            _store_document_vectors(
+                document_id,
+                user_id,
+                extracted_text,
+                classification,
+                entities,
+                risk_assessment,
+            )
+        )
+        if not stored:
+            logger.warning(
+                f"Document {document_id} was not stored in the vector database"
+            )
+
+        # Step 6: Update document status (100%)
+        report_progress(100, "Completing processing")
+
+        # Status update and history row are committed together so they
+        # cannot get out of sync.
+        document.status = "processed"
+        document.processing_progress = 100
+        db.add(
+            ProcessingHistory(
+                document_id=document_id,
+                stage="complete",
+                status="success",
+                details={
+                    "classification": classification,
+                    "entities_count": len(entities),
+                    "risk_score": risk_assessment.get("risk_score", 0)
+                }
+            )
+        )
+        db.commit()
+
+        return {
+            "document_id": document_id,
+            "status": "success",
+            "classification": classification,
+            "entities": entities,
+            "risk_assessment": risk_assessment,
+            # The Celery request context has no execution-time attribute,
+            # so the duration is measured here.
+            "processing_time": time.monotonic() - started_at
+        }
+
+    except Exception as exc:
+        logger.error(f"Document processing failed: {exc}")
+
+        # Record the failure only when the row was actually loaded; the
+        # session is rolled back first in case the error left it unusable.
+        if db is not None and document is not None:
+            try:
+                db.rollback()
+                document.status = "failed"
+                document.processing_error = str(exc)
+                db.commit()
+            except Exception as status_exc:
+                logger.error(
+                    f"Could not record failure for document {document_id}: {status_exc}"
+                )
+
+        # Retry with exponential backoff
+        if self.request.retries < self.max_retries:
+            countdown = 2 ** self.request.retries
+            raise self.retry(countdown=countdown, exc=exc)
+        else:
+            raise
+
+    finally:
+        # NOTE(review): assumes get_db() yields a SQLAlchemy-style session.
+        if db is not None:
+            db.close()
+
+@celery_app.task(bind=True, max_retries=3)
+def extract_text(self, document_id: int) -> str:
+    """Extract text from document
+
+    Runs the ingestion agent on the document's file, stores the result in
+    Document.extracted_text and returns it.
+
+    Args:
+        document_id: ID of the document to read
+
+    Returns:
+        The extracted text ("" when the agent result has no "text" entry)
+
+    Raises:
+        celery.exceptions.Retry: on failure while retries remain
+        Exception: the original error once retries are exhausted
+    """
+    db = None
+    try:
+        db = next(get_db())
+        document = db.query(Document).filter(Document.id == document_id).first()
+
+        if not document:
+            raise ValueError(f"Document {document_id} not found")
+
+        # Use ingestion agent to extract text. The agent API is a coroutine
+        # and this task is synchronous, so it is driven with asyncio.run();
+        # `await` inside a plain `def` is a SyntaxError.
+        agent_service = AgentService()
+        result = asyncio.run(
+            agent_service.ingestion_agent.run(
+                "Extract text from document",
+                {"document_path": document.file_path}
+            )
+        )
+
+        # Update document with extracted text
+        text = result.get("text", "")
+        document.extracted_text = text
+        db.commit()
+
+        return text
+
+    except Exception as exc:
+        logger.error(f"Text extraction failed: {exc}")
+        if self.request.retries < self.max_retries:
+            countdown = 2 ** self.request.retries
+            raise self.retry(countdown=countdown, exc=exc)
+        else:
+            raise
+
+    finally:
+        # NOTE(review): assumes get_db() yields a SQLAlchemy-style session.
+        if db is not None:
+            db.close()
+
+
+@celery_app.task(bind=True, max_retries=3)
+def classify_document(self, document_id: int, text: str) -> Dict[str, Any]:
+    """Classify document type and domain
+
+    Args:
+        document_id: ID of the document (not used by the classification itself)
+        text: Extracted document text handed to the classifier agent
+
+    Returns:
+        The classifier agent's result
+
+    Raises:
+        celery.exceptions.Retry: on failure while retries remain
+        Exception: the original error once retries are exhausted
+    """
+    try:
+        agent_service = AgentService()
+        # The agent API is a coroutine and this task is synchronous, so it is
+        # driven with asyncio.run(); `await` inside a plain `def` is a
+        # SyntaxError.
+        return asyncio.run(
+            agent_service.classifier_agent.run(
+                "Classify document type and domain",
+                {"text": text}
+            )
+        )
+
+    except Exception as exc:
+        logger.error(f"Document classification failed: {exc}")
+        if self.request.retries < self.max_retries:
+            countdown = 2 ** self.request.retries
+            raise self.retry(countdown=countdown, exc=exc)
+        else:
+            raise
+
+
+@celery_app.task(bind=True, max_retries=3)
+def extract_entities(self, document_id: int, text: str) -> Dict[str, Any]:
+    """Extract named entities from document
+
+    Args:
+        document_id: ID of the document (not used by the extraction itself)
+        text: Extracted document text handed to the entity agent
+
+    Returns:
+        The entity agent's result
+
+    Raises:
+        celery.exceptions.Retry: on failure while retries remain
+        Exception: the original error once retries are exhausted
+    """
+    try:
+        agent_service = AgentService()
+        # The agent API is a coroutine and this task is synchronous, so it is
+        # driven with asyncio.run(); `await` inside a plain `def` is a
+        # SyntaxError.
+        return asyncio.run(
+            agent_service.entity_agent.run(
+                "Extract named entities from text",
+                {"text": text}
+            )
+        )
+
+    except Exception as exc:
+        logger.error(f"Entity extraction failed: {exc}")
+        if self.request.retries < self.max_retries:
+            countdown = 2 ** self.request.retries
+            raise self.retry(countdown=countdown, exc=exc)
+        else:
+            raise
+
+
+@celery_app.task(bind=True, max_retries=3)
+def assess_risk(self, document_id: int, text: str, entities: Dict[str, Any]) -> Dict[str, Any]:
+    """Assess document risk and compliance
+
+    Args:
+        document_id: ID of the document (not used by the assessment itself)
+        text: Extracted document text
+        entities: Entity-extraction result passed to the risk agent with the text
+
+    Returns:
+        The risk agent's result
+
+    Raises:
+        celery.exceptions.Retry: on failure while retries remain
+        Exception: the original error once retries are exhausted
+    """
+    try:
+        agent_service = AgentService()
+        # The agent API is a coroutine and this task is synchronous, so it is
+        # driven with asyncio.run(); `await` inside a plain `def` is a
+        # SyntaxError.
+        return asyncio.run(
+            agent_service.risk_agent.run(
+                "Assess document risk and compliance",
+                {"text": text, "entities": entities}
+            )
+        )
+
+    except Exception as exc:
+        logger.error(f"Risk assessment failed: {exc}")
+        if self.request.retries < self.max_retries:
+            countdown = 2 ** self.request.retries
+            raise self.retry(countdown=countdown, exc=exc)
+        else:
+            raise
+
+
+@celery_app.task(bind=True, max_retries=3)
+def compare_documents(self, document1_id: int, document2_id: int) -> Dict[str, Any]:
+    """Compare two documents for similarities and differences
+
+    Args:
+        document1_id: ID of the first document
+        document2_id: ID of the second document
+
+    Returns:
+        The compare agent's result
+
+    Raises:
+        celery.exceptions.Retry: on failure while retries remain
+        Exception: the original error once retries are exhausted
+    """
+    try:
+        agent_service = AgentService()
+        # The agent API is a coroutine and this task is synchronous, so it is
+        # driven with asyncio.run(); `await` inside a plain `def` is a
+        # SyntaxError.
+        return asyncio.run(
+            agent_service.compare_agent.run(
+                "Compare two documents",
+                {"document1_id": document1_id, "document2_id": document2_id}
+            )
+        )
+
+    except Exception as exc:
+        logger.error(f"Document comparison failed: {exc}")
+        if self.request.retries < self.max_retries:
+            countdown = 2 ** self.request.retries
+            raise self.retry(countdown=countdown, exc=exc)
+        else:
+            raise
+
+
+@celery_app.task
+def process_pending_documents():
+    """Process all pending documents in the queue
+
+    Dispatches process_document for up to 10 documents whose status is
+    "pending".
+
+    Returns:
+        {"processed": <number of documents dispatched>}
+
+    Raises:
+        Exception: any error from the query or the dispatch, after logging it
+    """
+    db = None
+    try:
+        db = next(get_db())
+        pending_documents = db.query(Document).filter(
+            Document.status == "pending"
+        ).limit(10).all()
+
+        # NOTE(review): the status is not changed here, so a document stays
+        # "pending" until process_document finishes; if this task runs again
+        # before then, the same document is dispatched a second time.
+        for document in pending_documents:
+            process_document.delay(document.id, document.user_id)
+
+        return {"processed": len(pending_documents)}
+
+    except Exception as exc:
+        logger.error(f"Pending document processing failed: {exc}")
+        raise
+
+    finally:
+        # NOTE(review): assumes get_db() yields a SQLAlchemy-style session.
+        if db is not None:
+            db.close()
diff --git a/config/chroma_auth.json b/config/chroma_auth.json
new file mode 100644
index 0000000..58d6a29
--- /dev/null
+++ b/config/chroma_auth.json
@@ -0,0 +1,4 @@
+{
+  "admin": "$2b$12$LQv3c1yqBWVHxkd0LHAkCOYz6TtxMQJqhN8/LewdBPj4J/8JQHqGq",
+  "user1": "$2b$12$LQv3c1yqBWVHxkd0LHAkCOYz6TtxMQJqhN8/LewdBPj4J/8JQHqGq"
+}
diff --git a/env.example b/env.example
new file mode 100644
index 0000000..b385022
--- /dev/null
+++ b/env.example
@@ -0,0 +1,203 @@
+# AI Document Agent - Environment Configuration Template
+# Copy this file to .env and configure your values
+
+# =============================================================================
+# CORE APPLICATION SETTINGS
+# =============================================================================
+APP_NAME=AI Document Agent
+APP_VERSION=1.0.0
+DEBUG=false
+LOG_LEVEL=INFO
+ENVIRONMENT=production
+
+# Server Configuration
+HOST=0.0.0.0
+PORT=8000
+WORKER_PROCESSES=4
+MAX_CONCURRENT_REQUESTS=100
+
+# =============================================================================
+# SECURITY SETTINGS
+# =============================================================================
+SECRET_KEY=your-super-secret-key-change-in-production
+ALGORITHM=HS256
+ACCESS_TOKEN_EXPIRE_MINUTES=30
+REFRESH_TOKEN_EXPIRE_DAYS=7
+
+# JWT Settings
+JWT_SECRET_KEY=your-jwt-secret-key-change-in-production
+JWT_ALGORITHM=HS256
+JWT_ACCESS_TOKEN_EXPIRE_MINUTES=30
+JWT_REFRESH_TOKEN_EXPIRE_DAYS=7
+
+# Security Headers
+ENABLE_CORS=true
+ALLOWED_ORIGINS=http://localhost:3000,http://localhost:8000
+ALLOWED_METHODS=GET,POST,PUT,DELETE,OPTIONS
+ALLOWED_HEADERS=*
+
+# Rate Limiting
+RATE_LIMIT_REQUESTS=1000
+RATE_LIMIT_WINDOW=3600
+ENABLE_RATE_LIMITING=true
+
+# =============================================================================
+# DATABASE CONFIGURATION
+# =============================================================================
+DATABASE_URL=postgresql://postgres:password@localhost:5432/smart_doc_bot
+DATABASE_POOL_SIZE=10
+DATABASE_MAX_OVERFLOW=20
+DATABASE_POOL_TIMEOUT=30
+DATABASE_POOL_RECYCLE=3600
+
+# Database Migration
+ALEMBIC_CONFIG=alembic.ini
+MIGRATION_AUTO_UPGRADE=true
+
+# =============================================================================
+# REDIS CONFIGURATION
+# =============================================================================
+REDIS_URL=redis://localhost:6379/0
+REDIS_MAX_CONNECTIONS=10
+REDIS_PASSWORD=
+REDIS_DB=0
+
+# =============================================================================
+# CHROMADB VECTOR DATABASE
+# =============================================================================
+CHROMA_PERSIST_DIRECTORY=./chroma_db
+CHROMA_COLLECTION_NAME=documents
+CHROMA_SERVER_HOST=localhost
+CHROMA_SERVER_PORT=8001
+
+# =============================================================================
+# AI/ML SERVICES
+# =============================================================================
+OPENAI_API_KEY=your-openai-api-key
+OPENAI_MODEL=gpt-4
+OPENAI_MAX_TOKENS=4000
+OPENAI_TEMPERATURE=0.1
+
+ANTHROPIC_API_KEY=your-anthropic-api-key
+ANTHROPIC_MODEL=claude-3-sonnet-20240229
+
+# Agent Configuration
+AGENT_TIMEOUT=300
+AGENT_MAX_RETRIES=3
+AGENT_CONCURRENT_LIMIT=10
+AGENT_CONFIDENCE_THRESHOLD=0.8
+
+# =============================================================================
+# FILE STORAGE
+# =============================================================================
+UPLOAD_DIR=./uploads
+MAX_FILE_SIZE=104857600
+ALLOWED_FILE_TYPES=.pdf,.docx,.txt,.csv,.xlsx,.pptx,.doc,.rtf
+ENABLE_VIRUS_SCAN=true
+
+# =============================================================================
+# MONITORING & OBSERVABILITY
+# =============================================================================
+ENABLE_MONITORING=true
+PROMETHEUS_PORT=9090
+GRAFANA_PORT=3001
+ELASTICSEARCH_ENABLED=true
+JAEGER_ENABLED=true
+
+# Prometheus Configuration
+PROMETHEUS_ENABLED=true
+PROMETHEUS_METRICS_PATH=/metrics
+PROMETHEUS_PUSHGATEWAY=http://localhost:9091
+
+# Grafana Configuration
+GRAFANA_ENABLED=true
+GRAFANA_ADMIN_USER=admin
+GRAFANA_ADMIN_PASSWORD=admin
+
+# Elasticsearch Configuration
+ELASTICSEARCH_URL=http://localhost:9200
+ELASTICSEARCH_INDEX_PREFIX=smart-doc-bot
+ELASTICSEARCH_USERNAME=
+ELASTICSEARCH_PASSWORD=
+
+# Jaeger Configuration
+JAEGER_AGENT_HOST=localhost
+JAEGER_AGENT_PORT=6831
+JAEGER_COLLECTOR_URL=http://localhost:14268/api/traces
+
+# =============================================================================
+# AUDIT & COMPLIANCE
+# =============================================================================
+AUDIT_LOG_ENABLED=true
+AUDIT_LOG_RETENTION_DAYS=90
+COMPLIANCE_SCAN_ENABLED=true
+PII_REDACTION_ENABLED=true
+
+# Compliance Frameworks
+ENABLE_GDPR_COMPLIANCE=true
+ENABLE_HIPAA_COMPLIANCE=true
+ENABLE_SOX_COMPLIANCE=true
+
+# =============================================================================
+# EMAIL & NOTIFICATIONS
+# =============================================================================
+SMTP_HOST=smtp.gmail.com
+SMTP_PORT=587
+SMTP_USERNAME=your-email@gmail.com
+SMTP_PASSWORD=your-app-password
+SMTP_USE_TLS=true
+SMTP_USE_SSL=false
+
+# Email Templates
+EMAIL_FROM=noreply@smartdocbot.com
+EMAIL_REPLY_TO=support@smartdocbot.com
+
+# =============================================================================
+# BACKUP & RECOVERY
+# =============================================================================
+BACKUP_ENABLED=true
+BACKUP_RETENTION_DAYS=30
+BACKUP_SCHEDULE=0 2 * * *
+BACKUP_STORAGE_PATH=./backups
+
+# =============================================================================
+# PERFORMANCE & CACHING
+# =============================================================================
+ENABLE_CACHING=true
+CACHE_TTL=3600
+CACHE_MAX_SIZE=1000
+
+# Session Management
+SESSION_SECRET_KEY=your-session-secret-key
+SESSION_TTL=3600
+SESSION_COOKIE_SECURE=true
+
+# =============================================================================
+# FEATURE FLAGS
+# =============================================================================
+ENABLE_WEBSOCKETS=true
+ENABLE_SSE=true
+ENABLE_REAL_TIME_UPDATES=true
+ENABLE_DOCUMENT_COMPARISON=true
+ENABLE_QA_FEATURE=true
+ENABLE_AUDIT_TRAIL=true
+ENABLE_ANALYTICS=true
+
+# =============================================================================
+# INTEGRATION SETTINGS
+# =============================================================================
+# External API Keys (if needed)
+GOOGLE_CLOUD_API_KEY=
+AWS_ACCESS_KEY_ID=
+AWS_SECRET_ACCESS_KEY=
+AWS_REGION=us-east-1
+AWS_S3_BUCKET=smart-doc-bot-uploads
+
+# =============================================================================
+# DEVELOPMENT SETTINGS
+# =============================================================================
+# Only set these in development
+# DEBUG=true
+# LOG_LEVEL=DEBUG
+# ENABLE_MONITORING=false
+# ENABLE_RATE_LIMITING=false
diff --git a/k8s/configmaps.yaml b/k8s/configmaps.yaml
new file mode 100644
index 0000000..161ecc1
--- /dev/null
+++ b/k8s/configmaps.yaml
@@ -0,0 +1,113 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: ai-document-agent-config
+  namespace: ai-document-agent
+data:
+  # Application Configuration
+  APP_NAME: "AI Document Agent"
+  APP_VERSION: "1.0.0"
+  DEBUG: "false"
+  LOG_LEVEL: "INFO"
+  ENVIRONMENT: "production"
+  
+  # Server Configuration
+  HOST: "0.0.0.0"
+  PORT: "8000"
+  WORKER_PROCESSES: "4"
+  MAX_CONCURRENT_REQUESTS: "100"
+  
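+  # Database Configuration
+  # NOTE: Kubernetes expands $(VAR) references only in a container's
+  # `env[].value`, `command` and `args`. Values loaded from a ConfigMap
+  # through `envFrom` are passed through verbatim, so the
+  # $(DATABASE_PASSWORD) placeholder below reaches the application
+  # unexpanded. To get the real password, define DATABASE_URL under `env`
+  # in the Deployment, after DATABASE_PASSWORD.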
+  DATABASE_URL: "postgresql://postgres:$(DATABASE_PASSWORD)@postgres:5432/smart_doc_bot"
+  DATABASE_POOL_SIZE: "10"
+  DATABASE_MAX_OVERFLOW: "20"
+  
+  # Redis Configuration
+  REDIS_URL: "redis://redis:6379/0"
+  REDIS_MAX_CONNECTIONS: "10"
+  
+  # ChromaDB Configuration
+  CHROMA_PERSIST_DIRECTORY: "/app/chroma_db"
+  CHROMA_COLLECTION_NAME: "documents"
+  
+  # AI/ML Configuration
+  OPENAI_MODEL: "gpt-4"
+  OPENAI_MAX_TOKENS: "4000"
+  OPENAI_TEMPERATURE: "0.1"
+  AGENT_TIMEOUT: "300"
+  AGENT_MAX_RETRIES: "3"
+  AGENT_CONCURRENT_LIMIT: "10"
+  
+  # Security Configuration
+  ALGORITHM: "HS256"
+  ACCESS_TOKEN_EXPIRE_MINUTES: "30"
+  RATE_LIMIT_REQUESTS: "1000"
+  RATE_LIMIT_WINDOW: "3600"
+  
+  # Monitoring Configuration
+  ENABLE_MONITORING: "true"
+  PROMETHEUS_PORT: "9090"
+  GRAFANA_PORT: "3001"
+  ELASTICSEARCH_ENABLED: "true"
+  JAEGER_ENABLED: "true"
+  
+  # Audit Configuration
+  AUDIT_LOG_ENABLED: "true"
+  AUDIT_LOG_RETENTION_DAYS: "90"
+  COMPLIANCE_SCAN_ENABLED: "true"
+  PII_REDACTION_ENABLED: "true"
+  
+  # Feature Flags
+  ENABLE_WEBSOCKETS: "true"
+  ENABLE_SSE: "true"
+  ENABLE_REAL_TIME_UPDATES: "true"
+  ENABLE_DOCUMENT_COMPARISON: "true"
+  ENABLE_QA_FEATURE: "true"
+  ENABLE_AUDIT_TRAIL: "true"
+  ENABLE_ANALYTICS: "true"
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: ai-document-agent-nginx-config
+  namespace: ai-document-agent
+data:
+  nginx.conf: |
+    events {
+      worker_connections 1024;
+    }
+    
+    http {
+      upstream backend {
+        server backend:8000;
+      }
+      
+      upstream frontend {
+        server frontend:3000;
+      }
+      
+      server {
+        listen 80;
+        server_name localhost;
+        
+        location / {
+          proxy_pass http://frontend;
+          proxy_set_header Host $host;
+          proxy_set_header X-Real-IP $remote_addr;
+          proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+          proxy_set_header X-Forwarded-Proto $scheme;
+        }
+        
+        location /api/ {
+          proxy_pass http://backend;
+          proxy_set_header Host $host;
+          proxy_set_header X-Real-IP $remote_addr;
+          proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+          proxy_set_header X-Forwarded-Proto $scheme;
+        }
+        
+        location /health {
+          proxy_pass http://backend/health;
+        }
+      }
+    }
diff --git a/k8s/deployments.yaml b/k8s/deployments.yaml
new file mode 100644
index 0000000..92d1609
--- /dev/null
+++ b/k8s/deployments.yaml
@@ -0,0 +1,288 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: backend
+  namespace: ai-document-agent
+  labels:
+    app: smart-doc-bot
+    component: backend
+spec:
+  replicas: 3
+  selector:
+    matchLabels:  # must match the pod template labels below
+      app: smart-doc-bot
+      component: backend
+  template:
+    metadata:
+      labels:
+        app: smart-doc-bot
+        component: backend
+    spec:
+      containers:
+      - name: backend
+        image: smart-doc-bot-backend:latest  # NOTE(review): mutable tag; consider pinning a version or digest
+        ports:
+        - containerPort: 8000  # port targeted by the probes below and by nginx's "backend" upstream
+        env:  # sensitive values, injected one by one from the ai-document-agent-secrets Secret
+        - name: SECRET_KEY
+          valueFrom:
+            secretKeyRef:
+              name: ai-document-agent-secrets
+              key: secret-key
+        - name: JWT_SECRET_KEY
+          valueFrom:
+            secretKeyRef:
+              name: ai-document-agent-secrets
+              key: jwt-secret-key
+        - name: OPENAI_API_KEY
+          valueFrom:
+            secretKeyRef:
+              name: ai-document-agent-secrets
+              key: openai-api-key
+        - name: ANTHROPIC_API_KEY
+          valueFrom:
+            secretKeyRef:
+              name: ai-document-agent-secrets
+              key: anthropic-api-key
+        - name: DATABASE_PASSWORD
+          valueFrom:
+            secretKeyRef:
+              name: ai-document-agent-secrets
+              key: database-password  # same key the postgres Deployment reads as POSTGRES_PASSWORD
+        envFrom:  # non-secret settings: every key of the ConfigMap becomes an environment variable
+        - configMapRef:
+            name: ai-document-agent-config
+        resources:
+          requests:
+            memory: "512Mi"
+            cpu: "250m"
+          limits:
+            memory: "1Gi"
+            cpu: "500m"
+        livenessProbe:  # restarts the container when /health stops answering
+          httpGet:
+            path: /health
+            port: 8000
+          initialDelaySeconds: 30
+          periodSeconds: 10
+        readinessProbe:  # same endpoint as liveness, polled sooner and more often
+          httpGet:
+            path: /health
+            port: 8000
+          initialDelaySeconds: 5
+          periodSeconds: 5
+        volumeMounts:
+        - name: uploads
+          mountPath: /app/uploads
+        - name: chroma-db
+          mountPath: /app/chroma_db
+        - name: logs
+          mountPath: /app/logs
+      volumes:  # NOTE(review): 3 replicas share these claims; confirm the PVCs allow multi-node access
+      - name: uploads
+        persistentVolumeClaim:
+          claimName: uploads-pvc
+      - name: chroma-db
+        persistentVolumeClaim:
+          claimName: chroma-db-pvc
+      - name: logs
+        persistentVolumeClaim:
+          claimName: logs-pvc  # the nginx Deployment mounts this same claim at /var/log/nginx
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: frontend
+  namespace: ai-document-agent
+  labels:
+    app: smart-doc-bot
+    component: frontend
+spec:
+  replicas: 2
+  selector:
+    matchLabels:  # must match the pod template labels below
+      app: smart-doc-bot
+      component: frontend
+  template:
+    metadata:
+      labels:
+        app: smart-doc-bot
+        component: frontend
+    spec:
+      containers:
+      - name: frontend
+        image: smart-doc-bot-frontend:latest  # NOTE(review): mutable tag; consider pinning a version or digest
+        ports:
+        - containerPort: 3000  # port targeted by the probes below and by nginx's "frontend" upstream
+        env:  # NOTE(review): REACT_APP_* values are usually inlined at build time; confirm the image reads them at runtime
+        - name: REACT_APP_API_URL
+          value: "https://api.smartdocbot.com"
+        - name: REACT_APP_ENVIRONMENT
+          value: "production"
+        resources:
+          requests:
+            memory: "256Mi"
+            cpu: "100m"
+          limits:
+            memory: "512Mi"
+            cpu: "200m"
+        livenessProbe:  # restarts the container when GET / on port 3000 stops answering
+          httpGet:
+            path: /
+            port: 3000
+          initialDelaySeconds: 30
+          periodSeconds: 10
+        readinessProbe:  # same request as liveness, polled sooner and more often
+          httpGet:
+            path: /
+            port: 3000
+          initialDelaySeconds: 5
+          periodSeconds: 5
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: postgres
+  namespace: ai-document-agent
+  labels:
+    app: smart-doc-bot
+    component: database
+spec:
+  replicas: 1  # single instance; its data lives on postgres-pvc
+  selector:
+    matchLabels:  # must match the pod template labels below
+      app: smart-doc-bot
+      component: database
+  template:
+    metadata:
+      labels:
+        app: smart-doc-bot
+        component: database
+    spec:
+      containers:
+      - name: postgres
+        image: postgres:15-alpine
+        ports:
+        - containerPort: 5432
+        env:
+        - name: POSTGRES_DB
+          value: "smart_doc_bot"
+        - name: POSTGRES_USER
+          value: "postgres"
+        - name: POSTGRES_PASSWORD
+          valueFrom:
+            secretKeyRef:
+              name: ai-document-agent-secrets
+              key: database-password  # same key the backend reads as DATABASE_PASSWORD
+        resources:
+          requests:
+            memory: "1Gi"
+            cpu: "500m"
+          limits:
+            memory: "2Gi"
+            cpu: "1000m"
+        volumeMounts:
+        - name: postgres-data
+          mountPath: /var/lib/postgresql/data
+        - name: init-script
+          mountPath: /docker-entrypoint-initdb.d
+      volumes:
+      - name: postgres-data
+        persistentVolumeClaim:
+          claimName: postgres-pvc
+      - name: init-script
+        configMap:  # volume source is `configMap`; `configMapRef` exists only under envFrom
+          name: ai-document-agent-init-db
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: redis
+  namespace: ai-document-agent
+  labels:
+    app: smart-doc-bot
+    component: cache
+spec:
+  replicas: 1  # single instance; its data lives on redis-pvc
+  selector:
+    matchLabels:  # must match the pod template labels below
+      app: smart-doc-bot
+      component: cache
+  template:
+    metadata:
+      labels:
+        app: smart-doc-bot
+        component: cache
+    spec:
+      containers:
+      - name: redis
+        image: redis:7-alpine
+        ports:
+        - containerPort: 6379
+        command:  # every item must be a string
+        - redis-server
+        - --appendonly
+        - "yes"  # quoted: a bare yes resolves to boolean true and the manifest is rejected
+        - --maxmemory
+        - 256mb  # stays below the 512Mi container memory limit
+        - --maxmemory-policy
+        - allkeys-lru
+        resources:
+          requests:
+            memory: "256Mi"
+            cpu: "100m"
+          limits:
+            memory: "512Mi"
+            cpu: "200m"
+        volumeMounts:
+        - name: redis-data
+          mountPath: /data
+      volumes:
+      - name: redis-data
+        persistentVolumeClaim:
+          claimName: redis-pvc
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: nginx
+  namespace: ai-document-agent
+  labels:
+    app: smart-doc-bot
+    component: ingress
+spec:
+  replicas: 2
+  selector:
+    matchLabels:  # must match the pod template labels below
+      app: smart-doc-bot
+      component: ingress
+  template:
+    metadata:
+      labels:
+        app: smart-doc-bot
+        component: ingress
+    spec:
+      containers:
+      - name: nginx
+        image: nginx:alpine
+        ports:
+        - containerPort: 80
+        - containerPort: 443  # NOTE(review): the mounted nginx.conf only declares "listen 80"
+        volumeMounts:
+        - name: nginx-config
+          mountPath: /etc/nginx/nginx.conf
+          subPath: nginx.conf  # mounts just this ConfigMap key as the file, not a directory
+        - name: nginx-ssl
+          mountPath: /etc/nginx/ssl
+        - name: nginx-logs
+          mountPath: /var/log/nginx
+      volumes:
+      - name: nginx-config
+        configMap:  # volume source is `configMap`; `configMapRef` exists only under envFrom
+          name: ai-document-agent-nginx-config
+      - name: nginx-ssl
+        secret:  # volume source is `secret` with `secretName`; `secretRef` exists only under envFrom
+          secretName: ai-document-agent-tls
+      - name: nginx-logs
+        persistentVolumeClaim:
+          claimName: logs-pvc  # the backend Deployment mounts this same claim at /app/logs
diff --git a/k8s/namespace.yaml b/k8s/namespace.yaml
new file mode 100644
index 0000000..7821cc5
--- /dev/null
+++ b/k8s/namespace.yaml
@@ -0,0 +1,12 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: ai-document-agent  # the ConfigMaps, Deployments and Secrets in this change target this namespace
+  labels:
+    name: ai-document-agent
+    app: smart-doc-bot  # same app label the Deployments carry
+    environment: production
+  annotations:  # free-form descriptive metadata
+    description: "AI Document Agent - Enterprise Document Intelligence Platform"
+    owner: "ai-document-agent-team"
+    cost-center: "engineering"
diff --git a/k8s/secrets.yaml b/k8s/secrets.yaml
new file mode 100644
index 0000000..514d7c4
--- /dev/null
+++ b/k8s/secrets.yaml
@@ -0,0 +1,26 @@
+apiVersion: v1
+kind: Secret
+metadata:
+  name: ai-document-agent-secrets  # read via secretKeyRef by the backend and postgres Deployments
+  namespace: ai-document-agent
+type: Opaque
+data:
+  # Base64 encoded placeholders - replace with real values and keep those out of version control
+  secret-key: eW91ci1zdXBlci1zZWNyZXQta2V5LWNoYW5nZS1pbi1wcm9kdWN0aW9u
+  jwt-secret-key: eW91ci1qd3Qtc2VjcmV0LWtleS1jaGFuZ2UtaW4tcHJvZHVjdGlvbg==
+  openai-api-key: eW91ci1vcGVuYWktYXBpLWtleQ==
+  anthropic-api-key: eW91ci1hbnRocm9waWMtYXBpLWtleQ==
+  database-password: cGFzc3dvcmQ=  # decodes to "password"; must be replaced before deployment
+  redis-password: ""  # explicit empty value; a bare key is an implicit null
+  smtp-password: eW91ci1zbXRwLXBhc3N3b3Jk
+---
+apiVersion: v1
+kind: Secret
+metadata:
+  name: ai-document-agent-tls  # meant for the nginx Deployment's nginx-ssl volume
+  namespace: ai-document-agent
+type: kubernetes.io/tls
+data:
+  # Base64 encoded TLS certificate and key - replace with actual values
+  tls.crt: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCg==  # placeholder: decodes to only the "-----BEGIN CERTIFICATE-----" line
+  tls.key: LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCg==  # placeholder: decodes to only the "-----BEGIN PRIVATE KEY-----" line
diff --git a/model/summarizer.py b/model/summarizer.py
index 7418477..4687fa8 100644
--- a/model/summarizer.py
+++ b/model/summarizer.py
@@ -1,9 +1,356 @@
-def summarize_text(text: str) -> str:
+import os
+import re
+from typing import Dict, List, Optional, Any
+from openai import OpenAI
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.chains.summarize import load_summarize_chain
+from langchain.chat_models import ChatOpenAI
+from langchain.schema import Document
+
+class AdvancedSummarizer:
+    """Advanced AI-powered document summarization using OpenAI GPT models"""
+    
+    def __init__(self, model_name: str = "gpt-4", max_tokens: int = 1000):
+        self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))  # key is read from the OPENAI_API_KEY env var
+        self.model_name = model_name  # sent as `model` on every chat-completion request
+        self.max_tokens = max_tokens  # completion cap used by the summarization requests
+        self.text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=4000,  # measured with len(), i.e. in characters
+            chunk_overlap=200,
+            length_function=len,
+        )
+    
+    def summarize_text(self, text: str, summary_type: str = "extractive", 
+                      max_length: int = 500) -> Dict[str, Any]:
+        """
+        Advanced text summarization with multiple strategies
+        
+        Args:
+            text: Input text to summarize
+            summary_type: Type of summarization (extractive, abstractive, bullet_points, executive)
+            max_length: Maximum length of summary
+            
+        Returns:
+            Dictionary containing summary and metadata
+        """
+        if not text or len(text.strip()) < 100:
+            return {
+                "summary": text,
+                "type": summary_type,
+                "confidence": 1.0,
+                "metadata": {"original_length": len(text), "summary_length": len(text)}
+            }
+        
+        try:
+            if summary_type == "extractive":
+                return self._extractive_summarization(text, max_length)
+            elif summary_type == "abstractive":
+                return self._abstractive_summarization(text, max_length)
+            elif summary_type == "bullet_points":
+                return self._bullet_point_summarization(text, max_length)
+            elif summary_type == "executive":
+                return self._executive_summarization(text, max_length)
+            else:
+                return self._abstractive_summarization(text, max_length)
+                
+        except Exception as e:
+            # Fallback to basic summarization
+            return self._fallback_summarization(text, max_length)
+    
+    def _extractive_summarization(self, text: str, max_length: int) -> Dict[str, Any]:
+        """Extractive summarization using key sentence extraction"""
+        try:
+            # Split text into sentences
+            sentences = re.split(r'(?<=[.!?])\s+', text.strip())
+            
+            # Use OpenAI to score and select key sentences
+            prompt = f"""
+            Analyze the following text and identify the {max_length//50} most important sentences that best summarize the content.
+            Focus on sentences that contain key information, main points, and conclusions.
+            
+            Text:
+            {text}
+            
+            Return only the selected sentences in order, separated by newlines.
+            """
+            
+            response = self.client.chat.completions.create(
+                model=self.model_name,
+                messages=[{"role": "user", "content": prompt}],
+                max_tokens=self.max_tokens,
+                temperature=0.1
+            )
+            
+            summary = response.choices[0].message.content.strip()
+            
+            return {
+                "summary": summary,
+                "type": "extractive",
+                "confidence": 0.9,
+                "metadata": {
+                    "original_length": len(text),
+                    "summary_length": len(summary),
+                    "compression_ratio": len(summary) / len(text) if len(text) > 0 else 0
+                }
+            }
+            
+        except Exception as e:
+            return self._fallback_summarization(text, max_length)
+    
+    def _abstractive_summarization(self, text: str, max_length: int) -> Dict[str, Any]:
+        """Abstractive summarization using GPT models"""
+        try:
+            # For long texts, use chunking
+            if len(text) > 8000:
+                return self._chunked_summarization(text, max_length)
+            
+            prompt = f"""
+            Create a comprehensive summary of the following text in approximately {max_length} characters.
+            The summary should capture the main points, key insights, and conclusions.
+            Write in a clear, professional tone.
+            
+            Text:
+            {text}
+            
+            Summary:
+            """
+            
+            response = self.client.chat.completions.create(
+                model=self.model_name,
+                messages=[{"role": "user", "content": prompt}],
+                max_tokens=self.max_tokens,
+                temperature=0.3
+            )
+            
+            summary = response.choices[0].message.content.strip()
+            
+            return {
+                "summary": summary,
+                "type": "abstractive",
+                "confidence": 0.95,
+                "metadata": {
+                    "original_length": len(text),
+                    "summary_length": len(summary),
+                    "compression_ratio": len(summary) / len(text) if len(text) > 0 else 0
+                }
+            }
+            
+        except Exception as e:
+            return self._fallback_summarization(text, max_length)
+    
+    def _bullet_point_summarization(self, text: str, max_length: int) -> Dict[str, Any]:
+        """Bullet point summarization"""
+        try:
+            prompt = f"""
+            Create a bullet-point summary of the following text with key points and insights.
+            Use clear, concise bullet points that capture the main information.
+            Limit to approximately {max_length} characters total.
+            
+            Text:
+            {text}
+            
+            Bullet Point Summary:
+            """
+            
+            response = self.client.chat.completions.create(
+                model=self.model_name,
+                messages=[{"role": "user", "content": prompt}],
+                max_tokens=self.max_tokens,
+                temperature=0.2
+            )
+            
+            summary = response.choices[0].message.content.strip()
+            
+            return {
+                "summary": summary,
+                "type": "bullet_points",
+                "confidence": 0.9,
+                "metadata": {
+                    "original_length": len(text),
+                    "summary_length": len(summary),
+                    "compression_ratio": len(summary) / len(text) if len(text) > 0 else 0
+                }
+            }
+            
+        except Exception as e:
+            return self._fallback_summarization(text, max_length)
+    
+    def _executive_summarization(self, text: str, max_length: int) -> Dict[str, Any]:
+        """Executive summary for business documents"""
+        try:
+            prompt = f"""
+            Create an executive summary of the following text suitable for business leaders.
+            Focus on key decisions, risks, opportunities, and actionable insights.
+            Write in a professional, executive-level tone.
+            Limit to approximately {max_length} characters.
+            
+            Text:
+            {text}
+            
+            Executive Summary:
+            """
+            
+            response = self.client.chat.completions.create(
+                model=self.model_name,
+                messages=[{"role": "user", "content": prompt}],
+                max_tokens=self.max_tokens,
+                temperature=0.2
+            )
+            
+            summary = response.choices[0].message.content.strip()
+            
+            return {
+                "summary": summary,
+                "type": "executive",
+                "confidence": 0.95,
+                "metadata": {
+                    "original_length": len(text),
+                    "summary_length": len(summary),
+                    "compression_ratio": len(summary) / len(text) if len(text) > 0 else 0
+                }
+            }
+            
+        except Exception as e:
+            return self._fallback_summarization(text, max_length)
+    
+    def _chunked_summarization(self, text: str, max_length: int) -> Dict[str, Any]:
+        """Handle long texts by chunking and summarizing"""
+        try:
+            # Split text into chunks
+            chunks = self.text_splitter.split_text(text)
+            
+            # Summarize each chunk
+            chunk_summaries = []
+            for chunk in chunks:
+                chunk_summary = self._abstractive_summarization(chunk, max_length // len(chunks))
+                chunk_summaries.append(chunk_summary["summary"])
+            
+            # Combine chunk summaries
+            combined_summary = " ".join(chunk_summaries)
+            
+            # Create final summary
+            final_summary = self._abstractive_summarization(combined_summary, max_length)
+            
+            return {
+                "summary": final_summary["summary"],
+                "type": "chunked_abstractive",
+                "confidence": 0.85,
+                "metadata": {
+                    "original_length": len(text),
+                    "summary_length": len(final_summary["summary"]),
+                    "chunks_processed": len(chunks),
+                    "compression_ratio": len(final_summary["summary"]) / len(text) if len(text) > 0 else 0
+                }
+            }
+            
+        except Exception as e:
+            return self._fallback_summarization(text, max_length)
+    
+    def _fallback_summarization(self, text: str, max_length: int) -> Dict[str, Any]:
+        """Fallback summarization when AI fails"""
+        sentences = re.split(r'(?<=[.!?])\s+', text.strip())
+        
+        # Simple extractive summarization
+        if len(sentences) <= 3:
+            summary = text
+        else:
+            # Take first, middle, and last sentences
+            summary_parts = []
+            summary_parts.append(sentences[0])
+            
+            if len(sentences) > 2:
+                middle_idx = len(sentences) // 2
+                summary_parts.append(sentences[middle_idx])
+            
+            if len(sentences) > 1:
+                summary_parts.append(sentences[-1])
+            
+            summary = " ".join(summary_parts)
+        
+        # Truncate if too long
+        if len(summary) > max_length:
+            summary = summary[:max_length-3] + "..."
+        
+        return {
+            "summary": summary,
+            "type": "fallback",
+            "confidence": 0.6,
+            "metadata": {
+                "original_length": len(text),
+                "summary_length": len(summary),
+                "compression_ratio": len(summary) / len(text) if len(text) > 0 else 0,
+                "error": "AI summarization failed, using fallback method"
+            }
+        }
+    
+    def analyze_summary_quality(self, original_text: str, summary: str) -> Dict[str, Any]:
+        """Analyze the quality of a summary"""
+        try:
+            prompt = f"""
+            Analyze the quality of this summary compared to the original text.
+            Rate the following aspects on a scale of 1-10:
+            1. Completeness: Does it capture all key points?
+            2. Accuracy: Is the information correct?
+            3. Clarity: Is it easy to understand?
+            4. Conciseness: Is it appropriately brief?
+            5. Relevance: Does it focus on important information?
+            
+            Original Text:
+            {original_text[:2000]}...
+            
+            Summary:
+            {summary}
+            
+            Provide your analysis as JSON:
+            {{
+                "completeness": 8,
+                "accuracy": 9,
+                "clarity": 8,
+                "conciseness": 7,
+                "relevance": 9,
+                "overall_score": 8.2,
+                "strengths": ["Captures key points", "Clear language"],
+                "improvements": ["Could include more details"]
+            }}
+            """
+            
+            response = self.client.chat.completions.create(
+                model=self.model_name,
+                messages=[{"role": "user", "content": prompt}],
+                max_tokens=500,
+                temperature=0.1
+            )
+            
+            import json
+            analysis = json.loads(response.choices[0].message.content.strip())
+            return analysis
+            
+        except Exception as e:
+            return {
+                "completeness": 5,
+                "accuracy": 5,
+                "clarity": 5,
+                "conciseness": 5,
+                "relevance": 5,
+                "overall_score": 5.0,
+                "strengths": ["Fallback analysis"],
+                "improvements": ["AI analysis failed"]
+            }
+
+
+# Backward compatibility function
+def summarize_text(text: str, summary_type: str = "abstractive", max_length: int = 500) -> str:
     """
-    Naive summarization by extracting the first 2-3 sentences.
-    In production, replace this with a call to an LLM or advanced NLP model.
+    Backward-compatible wrapper that returns only the summary string
+    
+    Args:
+        text: Input text to summarize
+        summary_type: Strategy name passed to AdvancedSummarizer.summarize_text
+        max_length: Maximum length of summary
+        
+    Returns:
+        The "summary" value of the result dictionary
     """
-    import re
-    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
-    summary = ' '.join(sentences[:3]) if len(sentences) > 2 else text
-    return summary
+    summarizer = AdvancedSummarizer()  # a new instance (and OpenAI client) is built on every call
+    result = summarizer.summarize_text(text, summary_type, max_length)
+    return result["summary"]
diff --git a/monitoring/filebeat/filebeat.yml b/monitoring/filebeat/filebeat.yml
new file mode 100644
index 0000000..4a70175
--- /dev/null
+++ b/monitoring/filebeat/filebeat.yml
@@ -0,0 +1,57 @@
+filebeat.inputs:
+- type: log  # application log files under /var/log/smart-doc-bot
+  enabled: true
+  paths:
+    - /var/log/smart-doc-bot/*.log
+  fields:
+    service: smart-doc-bot
+    environment: production
+  fields_under_root: true  # put service/environment at the event root instead of under "fields"
+  multiline.pattern: '^\['  # an event starts at a line beginning with "["
+  multiline.negate: true  # lines that do NOT match the pattern ...
+  multiline.match: after  # ... are appended to the preceding matching line
+
+- type: container  # Docker container log files
+  enabled: true
+  paths:
+    - '/var/lib/docker/containers/*/*.log'
+  processors:
+    - add_docker_metadata:
+        host: "unix:///var/run/docker.sock"
+
+processors:  # global processors, applied to events from every input
+  - add_host_metadata:
+      when.not.contains.tags: forwarded
+  - add_cloud_metadata: ~
+  - add_kubernetes_metadata:
+      host: ${NODE_NAME}  # expanded from the NODE_NAME environment variable
+      matchers:
+      - logs_path:
+          logs_path: "/var/log/containers/"  # NOTE(review): the container input above reads /var/lib/docker/containers; confirm this matcher path
+
+output.elasticsearch:
+  hosts: ["elasticsearch:9200"]
+  indices:
+    - index: "filebeat-%{[agent.version]}-%{+yyyy.MM.dd}"  # one index per Filebeat version and day
+  template.name: "filebeat"  # NOTE(review): template.* normally lives under setup.template.*; confirm these keys are honoured here
+  template.pattern: "filebeat-*"
+  template.enabled: false
+  template.overwrite: false
+
+setup.kibana:
+  host: "kibana:5601"
+
+setup.template.settings:
+  index.number_of_shards: 1
+  index.number_of_replicas: 0  # no replica shards
+
+setup.dashboards.enabled: true
+setup.dashboards.directory: /usr/share/filebeat/kibana
+
+logging.level: info
+logging.to_files: true
+logging.files:  # Filebeat's own log output, not the harvested application logs
+  path: /var/log/filebeat
+  name: filebeat
+  keepfiles: 7
+  permissions: 0644
diff --git a/monitoring/grafana/dashboards/ai-document-agent-dashboard.json b/monitoring/grafana/dashboards/ai-document-agent-dashboard.json
new file mode 100644
index 0000000..262170c
--- /dev/null
+++ b/monitoring/grafana/dashboards/ai-document-agent-dashboard.json
@@ -0,0 +1,457 @@
+{
+  "dashboard": {
+    "id": null,
+    "title": "AI Document Agent Dashboard",
+    "tags": ["ai", "document-processing", "agents"],
+    "style": "dark",
+    "timezone": "browser",
+    "panels": [
+      {
+        "id": 1,
+        "title": "System Overview",
+        "type": "stat",
+        "targets": [
+          {
+            "expr": "system_cpu_usage",
+            "legendFormat": "CPU Usage"
+          },
+          {
+            "expr": "system_memory_usage", 
+            "legendFormat": "Memory Usage"
+          },
+          {
+            "expr": "system_disk_usage",
+            "legendFormat": "Disk Usage"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "palette-classic"
+            },
+            "custom": {
+              "displayMode": "gradient-gauge"
+            },
+            "mappings": [],
+            "thresholds": {
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                },
+                {
+                  "color": "red",
+                  "value": 80
+                }
+              ]
+            }
+          }
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 12,
+          "x": 0,
+          "y": 0
+        }
+      },
+      {
+        "id": 2,
+        "title": "Agent Performance",
+        "type": "timeseries",
+        "targets": [
+          {
+            "expr": "rate(agent_execution_time_sum[5m]) / rate(agent_execution_time_count[5m])",
+            "legendFormat": "{{agent_type}} - Avg Execution Time"
+          },
+          {
+            "expr": "agent_confidence",
+            "legendFormat": "{{agent_type}} - Confidence"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "palette-classic"
+            },
+            "custom": {
+              "axisLabel": "",
+              "axisPlacement": "auto",
+              "barAlignment": 0,
+              "drawStyle": "line",
+              "fillOpacity": 10,
+              "gradientMode": "none",
+              "hideFrom": {
+                "legend": false,
+                "tooltip": false,
+                "vis": false
+              },
+              "lineInterpolation": "linear",
+              "lineWidth": 1,
+              "pointSize": 5,
+              "scaleDistribution": {
+                "type": "linear"
+              },
+              "showPoints": "never",
+              "spanNulls": false,
+              "stacking": {
+                "group": "A",
+                "mode": "none"
+              },
+              "thresholdsStyle": {
+                "mode": "off"
+              }
+            },
+            "mappings": [],
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                },
+                {
+                  "color": "red",
+                  "value": 80
+                }
+              ]
+            },
+            "unit": "s"
+          }
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 12,
+          "x": 12,
+          "y": 0
+        }
+      },
+      {
+        "id": 3,
+        "title": "Document Processing Pipeline",
+        "type": "timeseries",
+        "targets": [
+          {
+            "expr": "rate(workflow_execution_time_sum[5m]) / rate(workflow_execution_time_count[5m])",
+            "legendFormat": "Workflow Execution Time"
+          },
+          {
+            "expr": "workflow_stages_completed",
+            "legendFormat": "Stages Completed"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "palette-classic"
+            },
+            "custom": {
+              "axisLabel": "",
+              "axisPlacement": "auto",
+              "barAlignment": 0,
+              "drawStyle": "line",
+              "fillOpacity": 10,
+              "gradientMode": "none",
+              "hideFrom": {
+                "legend": false,
+                "tooltip": false,
+                "vis": false
+              },
+              "lineInterpolation": "linear",
+              "lineWidth": 1,
+              "pointSize": 5,
+              "scaleDistribution": {
+                "type": "linear"
+              },
+              "showPoints": "never",
+              "spanNulls": false,
+              "stacking": {
+                "group": "A",
+                "mode": "none"
+              },
+              "thresholdsStyle": {
+                "mode": "off"
+              }
+            },
+            "mappings": [],
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                },
+                {
+                  "color": "red",
+                  "value": 80
+                }
+              ]
+            },
+            "unit": "s"
+          }
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 12,
+          "x": 0,
+          "y": 8
+        }
+      },
+      {
+        "id": 4,
+        "title": "API Performance",
+        "type": "timeseries",
+        "targets": [
+          {
+            "expr": "rate(http_requests_total[5m])",
+            "legendFormat": "{{method}} {{endpoint}} - Requests/sec"
+          },
+          {
+            "expr": "rate(http_request_duration_sum[5m]) / rate(http_request_duration_count[5m])",
+            "legendFormat": "{{method}} {{endpoint}} - Response Time"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "palette-classic"
+            },
+            "custom": {
+              "axisLabel": "",
+              "axisPlacement": "auto",
+              "barAlignment": 0,
+              "drawStyle": "line",
+              "fillOpacity": 10,
+              "gradientMode": "none",
+              "hideFrom": {
+                "legend": false,
+                "tooltip": false,
+                "vis": false
+              },
+              "lineInterpolation": "linear",
+              "lineWidth": 1,
+              "pointSize": 5,
+              "scaleDistribution": {
+                "type": "linear"
+              },
+              "showPoints": "never",
+              "spanNulls": false,
+              "stacking": {
+                "group": "A",
+                "mode": "none"
+              },
+              "thresholdsStyle": {
+                "mode": "off"
+              }
+            },
+            "mappings": [],
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                },
+                {
+                  "color": "red",
+                  "value": 80
+                }
+              ]
+            },
+            "unit": "reqps"
+          }
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 12,
+          "x": 12,
+          "y": 8
+        }
+      },
+      {
+        "id": 5,
+        "title": "Agent Success Rate",
+        "type": "piechart",
+        "targets": [
+          {
+            "expr": "sum(agent_execution_time_count) by (agent_type)",
+            "legendFormat": "{{agent_type}}"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "palette-classic"
+            },
+            "custom": {
+              "hideFrom": {
+                "legend": false,
+                "tooltip": false,
+                "vis": false
+              }
+            },
+            "mappings": []
+          }
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 8,
+          "x": 0,
+          "y": 16
+        }
+      },
+      {
+        "id": 6,
+        "title": "Error Rate",
+        "type": "timeseries",
+        "targets": [
+          {
+            "expr": "rate(http_requests_total{status_code=~\"5..\"}[5m])",
+            "legendFormat": "5xx Errors"
+          },
+          {
+            "expr": "rate(http_requests_total{status_code=~\"4..\"}[5m])",
+            "legendFormat": "4xx Errors"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "palette-classic"
+            },
+            "custom": {
+              "axisLabel": "",
+              "axisPlacement": "auto",
+              "barAlignment": 0,
+              "drawStyle": "line",
+              "fillOpacity": 10,
+              "gradientMode": "none",
+              "hideFrom": {
+                "legend": false,
+                "tooltip": false,
+                "vis": false
+              },
+              "lineInterpolation": "linear",
+              "lineWidth": 1,
+              "pointSize": 5,
+              "scaleDistribution": {
+                "type": "linear"
+              },
+              "showPoints": "never",
+              "spanNulls": false,
+              "stacking": {
+                "group": "A",
+                "mode": "none"
+              },
+              "thresholdsStyle": {
+                "mode": "off"
+              }
+            },
+            "mappings": [],
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                },
+                {
+                  "color": "red",
+                  "value": 80
+                }
+              ]
+            },
+            "unit": "reqps"
+          }
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 8,
+          "x": 8,
+          "y": 16
+        }
+      },
+      {
+        "id": 7,
+        "title": "Memory Usage by Agent",
+        "type": "barchart",
+        "targets": [
+          {
+            "expr": "agent_memory_usage",
+            "legendFormat": "{{agent_type}}"
+          }
+        ],
+        "fieldConfig": {
+          "defaults": {
+            "color": {
+              "mode": "palette-classic"
+            },
+            "custom": {
+              "axisLabel": "",
+              "axisPlacement": "auto",
+              "barAlignment": 0,
+              "drawStyle": "bars",
+              "fillOpacity": 80,
+              "gradientMode": "none",
+              "hideFrom": {
+                "legend": false,
+                "tooltip": false,
+                "vis": false
+              },
+              "lineInterpolation": "linear",
+              "lineWidth": 1,
+              "pointSize": 5,
+              "scaleDistribution": {
+                "type": "linear"
+              },
+              "showPoints": "never",
+              "spanNulls": false,
+              "stacking": {
+                "group": "A",
+                "mode": "none"
+              },
+              "thresholdsStyle": {
+                "mode": "off"
+              }
+            },
+            "mappings": [],
+            "thresholds": {
+              "mode": "absolute",
+              "steps": [
+                {
+                  "color": "green",
+                  "value": null
+                },
+                {
+                  "color": "red",
+                  "value": 80
+                }
+              ]
+            },
+            "unit": "bytes"
+          }
+        },
+        "gridPos": {
+          "h": 8,
+          "w": 8,
+          "x": 16,
+          "y": 16
+        }
+      }
+    ],
+    "time": {
+      "from": "now-1h",
+      "to": "now"
+    },
+    "timepicker": {},
+    "templating": {
+      "list": []
+    },
+    "annotations": {
+      "list": []
+    },
+    "refresh": "5s",
+    "schemaVersion": 27,
+    "version": 1,
+    "links": []
+  }
+}
diff --git a/monitoring/grafana/datasources/prometheus.yml b/monitoring/grafana/datasources/prometheus.yml
new file mode 100644
index 0000000..e1a77f9
--- /dev/null
+++ b/monitoring/grafana/datasources/prometheus.yml
@@ -0,0 +1,14 @@
+# Grafana datasource provisioning file: registers Prometheus as the default
+# datasource.
+apiVersion: 1
+
+datasources:
+  - name: Prometheus
+    type: prometheus
+    # "proxy": queries are issued by the Grafana server, not by the browser.
+    access: proxy
+    url: http://prometheus:9090
+    isDefault: true
+    editable: true
+    jsonData:
+      # NOTE(review): presumably meant to match the Prometheus scrape
+      # interval -- confirm against the Prometheus configuration.
+      timeInterval: "5s"
+      queryTimeout: "60s"
+      httpMethod: "POST"
+    secureJsonData: {}
diff --git a/nginx/nginx.conf b/nginx/nginx.conf
new file mode 100644
index 0000000..782122d
--- /dev/null
+++ b/nginx/nginx.conf
@@ -0,0 +1,211 @@
+events {
+    worker_connections 1024;
+}
+
+http {
+    include       /etc/nginx/mime.types;
+    default_type  application/octet-stream;
+
+    # Logging
+    log_format main '$remote_addr - $remote_user [$time_local] "$request" '
+                    '$status $body_bytes_sent "$http_referer" '
+                    '"$http_user_agent" "$http_x_forwarded_for"';
+
+    access_log /var/log/nginx/access.log main;
+    error_log /var/log/nginx/error.log warn;
+
+    # Basic settings
+    sendfile on;
+    tcp_nopush on;
+    tcp_nodelay on;
+    keepalive_timeout 65;
+    types_hash_max_size 2048;
+    client_max_body_size 50M;
+
+    # Gzip compression
+    gzip on;
+    gzip_vary on;
+    gzip_min_length 1024;
+    gzip_proxied any;
+    gzip_comp_level 6;
+    gzip_types
+        text/plain
+        text/css
+        text/xml
+        text/javascript
+        application/json
+        application/javascript
+        application/xml+rss
+        application/atom+xml
+        image/svg+xml;
+
+    # Security headers
+    add_header X-Frame-Options "SAMEORIGIN" always;
+    add_header X-Content-Type-Options "nosniff" always;
+    add_header X-XSS-Protection "1; mode=block" always;
+    add_header Referrer-Policy "strict-origin-when-cross-origin" always;
+    add_header Content-Security-Policy "default-src 'self'; script-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline'; img-src 'self' data: https:; font-src 'self' data:; connect-src 'self' ws: wss:;" always;
+
+    # Rate limiting
+    limit_req_zone $binary_remote_addr zone=api:10m rate=10r/s;
+    limit_req_zone $binary_remote_addr zone=login:10m rate=1r/s;
+
+    # Upstream backend servers
+    upstream backend {
+        # With a single server in the group, least_conn has nothing to choose
+        # between and nginx ignores max_fails/fail_timeout; these settings only
+        # take effect once the additional servers below are enabled.
+        least_conn;
+        server backend:8000 max_fails=3 fail_timeout=30s;
+        # Add more backend servers for load balancing
+        # server backend2:8000 max_fails=3 fail_timeout=30s;
+        # server backend3:8000 max_fails=3 fail_timeout=30s;
+    }
+
+    # Upstream frontend servers
+    upstream frontend {
+        # With a single server in the group, least_conn has nothing to choose
+        # between and nginx ignores max_fails/fail_timeout; these settings only
+        # take effect once the additional server below is enabled.
+        least_conn;
+        server frontend:3000 max_fails=3 fail_timeout=30s;
+        # Add more frontend servers for load balancing
+        # server frontend2:3000 max_fails=3 fail_timeout=30s;
+    }
+
+    # HTTP server (redirect to HTTPS)
+    server {
+        listen 80;
+        server_name _;
+        
+        # Redirect all HTTP traffic to HTTPS
+        return 301 https://$host$request_uri;
+    }
+
+    # HTTPS server
+    server {
+        listen 443 ssl http2;
+        server_name _;
+
+        # SSL configuration
+        ssl_certificate /etc/nginx/ssl/cert.pem;
+        ssl_certificate_key /etc/nginx/ssl/key.pem;
+        ssl_protocols TLSv1.2 TLSv1.3;
+        # NOTE(review): the *-SHA512 names do not look like real OpenSSL cipher
+        # suites and the list has no ECDSA entries -- confirm with
+        # `openssl ciphers` before relying on it.
+        ssl_ciphers ECDHE-RSA-AES256-GCM-SHA512:DHE-RSA-AES256-GCM-SHA512:ECDHE-RSA-AES256-GCM-SHA384:DHE-RSA-AES256-GCM-SHA384;
+        ssl_prefer_server_ciphers off;
+        ssl_session_cache shared:SSL:10m;
+        ssl_session_timeout 10m;
+
+        # Security headers for HTTPS.
+        # nginx inherits add_header directives from the enclosing level only
+        # when the current level declares none of its own. Because this server
+        # adds Strict-Transport-Security, the security headers declared at the
+        # http level would be dropped from every HTTPS response, so they are
+        # declared again here.
+        add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always;
+        add_header X-Frame-Options "SAMEORIGIN" always;
+        add_header X-Content-Type-Options "nosniff" always;
+        add_header X-XSS-Protection "1; mode=block" always;
+        add_header Referrer-Policy "strict-origin-when-cross-origin" always;
+        add_header Content-Security-Policy "default-src 'self'; script-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline'; img-src 'self' data: https:; font-src 'self' data:; connect-src 'self' ws: wss:;" always;
+
+        # Frontend application
+        location / {
+            proxy_pass http://frontend;
+            proxy_http_version 1.1;
+            proxy_set_header Upgrade $http_upgrade;
+            proxy_set_header Connection 'upgrade';
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+            proxy_cache_bypass $http_upgrade;
+            # Long read timeout so idle upgraded (WebSocket) connections stay open.
+            proxy_read_timeout 86400;
+        }
+
+        # API endpoints
+        location /api/ {
+            limit_req zone=api burst=20 nodelay;
+
+            proxy_pass http://backend;
+            proxy_http_version 1.1;
+            proxy_set_header Upgrade $http_upgrade;
+            proxy_set_header Connection 'upgrade';
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+            proxy_cache_bypass $http_upgrade;
+
+            # Timeouts for API calls
+            proxy_connect_timeout 60s;
+            proxy_send_timeout 60s;
+            proxy_read_timeout 60s;
+        }
+
+        # Authentication endpoints (stricter rate limiting).
+        # Longest-prefix matching selects this location over /api/ for auth URLs.
+        location /api/v1/auth/ {
+            limit_req zone=login burst=5 nodelay;
+
+            proxy_pass http://backend;
+            proxy_http_version 1.1;
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+        }
+
+        # WebSocket support for real-time features
+        location /ws/ {
+            proxy_pass http://backend;
+            proxy_http_version 1.1;
+            proxy_set_header Upgrade $http_upgrade;
+            proxy_set_header Connection "upgrade";
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+            proxy_read_timeout 86400;
+        }
+
+        # Health check endpoint
+        location /health {
+            proxy_pass http://backend;
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+        }
+
+        # Metrics endpoint (for monitoring)
+        # NOTE(review): this publishes the backend's /metrics on the public
+        # listener with no allow/deny rules -- confirm that is intended.
+        location /metrics {
+            proxy_pass http://backend;
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+        }
+
+        # Static files (if served by backend)
+        location /static/ {
+            proxy_pass http://backend;
+            proxy_set_header Host $host;
+            expires 1y;
+            add_header Cache-Control "public, immutable";
+            # This location declares its own add_header, so nothing is inherited
+            # from the server level; the security headers are repeated here.
+            add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always;
+            add_header X-Frame-Options "SAMEORIGIN" always;
+            add_header X-Content-Type-Options "nosniff" always;
+            add_header X-XSS-Protection "1; mode=block" always;
+            add_header Referrer-Policy "strict-origin-when-cross-origin" always;
+            add_header Content-Security-Policy "default-src 'self'; script-src 'self' 'unsafe-inline' 'unsafe-eval'; style-src 'self' 'unsafe-inline'; img-src 'self' data: https:; font-src 'self' data:; connect-src 'self' ws: wss:;" always;
+        }
+
+        # Upload endpoint (larger file uploads)
+        location /api/v1/documents/upload {
+            client_max_body_size 50M;
+            proxy_pass http://backend;
+            proxy_http_version 1.1;
+            proxy_set_header Host $host;
+            proxy_set_header X-Real-IP $remote_addr;
+            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+            proxy_set_header X-Forwarded-Proto $scheme;
+            proxy_read_timeout 300s;
+            proxy_send_timeout 300s;
+        }
+
+        # Deny access to hidden files. This pattern already covers .ht* files,
+        # so a separate "/\.ht" location would never be reached.
+        location ~ /\. {
+            deny all;
+        }
+
+        # Error pages
+        error_page 404 /404.html;
+        error_page 500 502 503 504 /50x.html;
+
+        location = /50x.html {
+            root /usr/share/nginx/html;
+        }
+    }
+}
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..c547e6b
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,68 @@
+# pytest only reads a [pytest] section from pytest.ini; the [tool:pytest]
+# spelling is recognised in setup.cfg alone, so with it every option below
+# was ignored.
+[pytest]
+# Test discovery
+testpaths = backend/tests
+python_files = test_*.py *_test.py
+python_classes = Test*
+python_functions = test_*
+
+# Output and reporting
+addopts =
+    -v
+    --tb=short
+    --strict-markers
+    --disable-warnings
+    --cov=backend/app
+    --cov-report=term-missing
+    --cov-report=html:backend/htmlcov
+    --cov-report=xml:backend/coverage.xml
+    --cov-fail-under=80
+    --junitxml=backend/test-results.xml
+
+# Markers (--strict-markers rejects any marker not listed here)
+markers =
+    unit: Unit tests
+    integration: Integration tests
+    e2e: End-to-end tests
+    slow: Slow running tests
+    api: API tests
+    database: Database tests
+    agent: Agent tests
+    security: Security tests
+    performance: Performance tests
+    smoke: Smoke tests
+
+# Filtering
+filterwarnings =
+    ignore::DeprecationWarning
+    ignore::PendingDeprecationWarning
+    ignore::UserWarning
+
+# Test configuration
+minversion = 6.0
+# asyncio_mode is provided by the pytest-asyncio plugin.
+asyncio_mode = auto
+
+# Coverage configuration
+# NOTE(review): coverage.py looks for its settings in .coveragerc, setup.cfg,
+# tox.ini or pyproject.toml by default, not in pytest.ini. These sections
+# presumably only apply if this file is passed explicitly (e.g. via
+# --cov-config) -- TODO confirm, or move them to one of those files.
+[coverage:run]
+source = backend/app
+omit = 
+    */tests/*
+    */migrations/*
+    */__pycache__/*
+    */venv/*
+    */env/*
+
+# Lines matching any of these regular expressions are excluded from the report.
+[coverage:report]
+exclude_lines =
+    pragma: no cover
+    def __repr__
+    if self.debug:
+    if settings.DEBUG
+    raise AssertionError
+    raise NotImplementedError
+    if 0:
+    if __name__ == .__main__.:
+    class .*\bProtocol\):
+    @(abc\.)?abstractmethod
+
+[coverage:html]
+directory = backend/htmlcov
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 0000000..c66cc66
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1,80 @@
+# Development and Testing Dependencies
+# Install with: pip install -r requirements-dev.txt
+
+# Testing
+pytest==7.4.3
+pytest-asyncio==0.21.1
+pytest-cov==4.1.0
+pytest-mock==3.12.0
+pytest-xdist==3.3.1
+pytest-html==3.2.0
+pytest-json-report==1.5.0
+factory-boy==3.3.0
+faker==20.1.0
+
+# Code Quality
+flake8==6.1.0
+black==23.11.0
+isort==5.12.0
+mypy==1.7.1
+bandit==1.7.5
+safety==2.3.5
+pre-commit==3.5.0
+
+# Documentation
+sphinx==7.2.6
+sphinx-rtd-theme==1.3.0
+sphinx-autodoc-typehints==1.25.0
+myst-parser==2.0.0
+
+# Performance Testing
+locust==2.17.0
+# NOTE(review): wrk is a standalone C benchmarking tool and 4.2.0 matches its
+# upstream release number; confirm that a PyPI distribution "wrk==4.2.0"
+# exists, otherwise this pin makes the whole install fail.
+wrk==4.2.0
+
+# Development Tools
+ipython==8.17.2
+ipdb==0.13.13
+debugpy==1.8.0
+watchdog==3.0.0
+
+# Database Tools
+alembic==1.13.1
+psycopg2-binary==2.9.9
+
+# Monitoring and Debugging
+memory-profiler==0.61.0
+line-profiler==4.1.2
+py-spy==0.3.14
+
+# Security Tools
+# NOTE(review): verify that cryptography 41.0.8 was actually published; the
+# 41.0.x series appears to end at 41.0.7 -- TODO confirm.
+cryptography==41.0.8
+python-jose[cryptography]==3.3.0
+
+# API Testing
+httpx==0.25.2
+requests-mock==1.11.0
+responses==0.24.1
+
+# Mock and Stub Libraries
+freezegun==1.2.2
+vcrpy==6.0.1
+
+# Environment Management
+python-dotenv==1.0.0
+environs==10.0.0
+
+# Build Tools
+# NOTE(review): verify that build 0.11.0 was actually published; the release
+# history appears to go from 0.10.0 straight to 1.0.0 -- TODO confirm.
+build==0.11.0
+twine==4.0.2
+wheel==0.42.0
+
+# Type Checking
+types-requests==2.31.0.10
+types-PyYAML==6.0.12.12
+types-redis==4.6.0.9
+
+# Additional Utilities
+click==8.1.7
+rich==13.7.0
+tqdm==4.66.1
+colorama==0.4.6
diff --git a/requirements.txt b/requirements.txt
index 9a538ae..94a3719 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,62 +1,49 @@
-# Core FastAPI & Web Framework
-fastapi==0.110.0
-uvicorn==0.29.0
-pydantic==2.6.4
-python-multipart==0.0.9
-
-# LLM & AI Libraries
-openai==1.12.0
-langchain==0.1.0
-langchain-openai==0.0.5
-langchain-community==0.0.10
-transformers==4.37.2
-torch==2.1.2
-sentence-transformers==2.2.2
-
-# Document Processing
-pypdf2==3.0.1
-python-docx==1.1.0
-pytesseract==0.3.10
-Pillow==10.1.0
-pdf2image==1.16.3
+# Core Framework
+fastapi==0.104.1
+uvicorn[standard]==0.24.0
+pydantic==2.5.0
+pydantic-settings==2.1.0
 
-# Vector Database & Search
+# Database
 sqlalchemy==2.0.25
-redis==5.0.1
+alembic==1.13.1
 psycopg2-binary==2.9.9
-pgvector==0.2.4
-chromadb==0.4.22
-
-# Policy Engine
-opa==0.1.0
-
-# Monitoring & Observability
-opentelemetry-api==1.21.0
-opentelemetry-sdk==1.21.0
-opentelemetry-instrumentation-fastapi==0.42b0
-prometheus-client==0.19.0
+redis==5.0.1
 
-# Security & Authentication
+# Security
 python-jose[cryptography]==3.3.0
 passlib[bcrypt]==1.7.4
-python-multipart==0.0.9
+python-multipart==0.0.6
 
-# Utilities
-python-dotenv==1.0.0
-httpx==0.26.0
-celery==5.3.4
-redis==5.0.1
+# AI/ML
+openai==1.3.7
+langchain==0.1.0
+langchain-openai==0.0.2
+chromadb==0.4.18
+sentence-transformers==2.2.2
+
+# Document Processing
+python-docx==1.1.0
+PyPDF2==3.0.1
 pandas==2.1.4
 openpyxl==3.1.2
-reportlab==4.0.7
+
+# Monitoring & Observability
+prometheus-client==0.19.0
+structlog==23.2.0
 
 # Testing
 pytest==7.4.3
 pytest-asyncio==0.21.1
-httpx==0.26.0
+httpx==0.25.2
 
 # Development
-black==23.12.1
-isort==5.13.2
+black==23.11.0
+isort==5.12.0
 flake8==6.1.0
-mypy==1.8.0
+mypy==1.7.1
+
+# Additional Dependencies
+python-dotenv==1.0.0
+aiofiles==23.2.1
+python-dateutil==2.8.2
diff --git a/scripts/backup.sh b/scripts/backup.sh
new file mode 100644
index 0000000..324df75
--- /dev/null
+++ b/scripts/backup.sh
@@ -0,0 +1,430 @@
+#!/bin/bash
+
+# AI Document Agent Backup Script
+# This script creates comprehensive backups of the entire system
+
+set -e  # Exit on any error
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+# Colored log helpers. Each one writes its first argument to stdout behind a
+# level tag; %b expands backslash escapes the same way `echo -e` does, which
+# turns the color variables into ANSI sequences.
+print_status() {
+    printf '%b\n' "${BLUE}[INFO]${NC} $1"
+}
+
+print_success() {
+    printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
+}
+
+print_warning() {
+    printf '%b\n' "${YELLOW}[WARNING]${NC} $1"
+}
+
+print_error() {
+    printf '%b\n' "${RED}[ERROR]${NC} $1"
+}
+
+# Configuration
+BACKUP_DIR="./backups"
+DATE=$(date +%Y%m%d_%H%M%S)
+BACKUP_NAME="ai_document_agent_backup_$DATE"
+BACKUP_PATH="$BACKUP_DIR/$BACKUP_NAME"
+
+# Database configuration
+DB_HOST="${DB_HOST:-localhost}"
+DB_PORT="${DB_PORT:-5432}"
+DB_NAME="${DB_NAME:-ai_document_agent}"
+DB_USER="${DB_USER:-postgres}"
+
+# Function to check if command exists
+command_exists() {
+    command -v "$1" >/dev/null 2>&1
+}
+
+# Function to create backup directory
+create_backup_directory() {
+    print_status "Creating backup directory..."
+    mkdir -p "$BACKUP_PATH"
+    mkdir -p "$BACKUP_PATH/database"
+    mkdir -p "$BACKUP_PATH/files"
+    mkdir -p "$BACKUP_PATH/config"
+    mkdir -p "$BACKUP_PATH/logs"
+    print_success "Backup directory created: $BACKUP_PATH"
+}
+
+# Function to backup database
+backup_database() {
+    print_status "Backing up PostgreSQL database..."
+    
+    if command_exists pg_dump; then
+        # Prompt for password if not provided
+        if [ -z "$DB_PASSWORD" ]; then
+            echo -n "Enter database password: "
+            read -s DB_PASSWORD
+            echo
+        fi
+        
+        # Set password environment variable
+        export PGPASSWORD="$DB_PASSWORD"
+        
+        # Create database dump
+        pg_dump -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$DB_NAME" \
+            --verbose --clean --no-owner --no-privileges \
+            --file="$BACKUP_PATH/database/full_backup.sql"
+        
+        # Create schema-only backup
+        pg_dump -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$DB_NAME" \
+            --verbose --schema-only --no-owner --no-privileges \
+            --file="$BACKUP_PATH/database/schema_only.sql"
+        
+        # Create data-only backup
+        pg_dump -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$DB_NAME" \
+            --verbose --data-only --no-owner --no-privileges \
+            --file="$BACKUP_PATH/database/data_only.sql"
+        
+        print_success "Database backup completed"
+    else
+        print_error "pg_dump not found. Please install PostgreSQL client tools."
+        return 1
+    fi
+}
+
+# Function to backup uploaded files
+backup_files() {
+    print_status "Backing up uploaded files..."
+    
+    if [ -d "./uploads" ]; then
+        tar -czf "$BACKUP_PATH/files/uploads.tar.gz" -C . uploads/
+        print_success "Uploaded files backed up"
+    else
+        print_warning "Uploads directory not found, skipping"
+    fi
+    
+    if [ -d "./output" ]; then
+        tar -czf "$BACKUP_PATH/files/output.tar.gz" -C . output/
+        print_success "Output files backed up"
+    else
+        print_warning "Output directory not found, skipping"
+    fi
+    
+    if [ -d "./chroma_db" ]; then
+        tar -czf "$BACKUP_PATH/files/chroma_db.tar.gz" -C . chroma_db/
+        print_success "ChromaDB vector database backed up"
+    else
+        print_warning "ChromaDB directory not found, skipping"
+    fi
+}
+
+# Function to backup configuration files
+backup_config() {
+    print_status "Backing up configuration files..."
+    
+    # Backup environment files
+    if [ -f ".env" ]; then
+        cp .env "$BACKUP_PATH/config/"
+        print_success "Environment file backed up"
+    fi
+    
+    if [ -f "backend/.env" ]; then
+        cp backend/.env "$BACKUP_PATH/config/"
+        print_success "Backend environment file backed up"
+    fi
+    
+    # Backup configuration files
+    if [ -d "backend/app/core" ]; then
+        tar -czf "$BACKUP_PATH/config/backend_config.tar.gz" -C backend/app core/
+        print_success "Backend configuration backed up"
+    fi
+    
+    if [ -d "frontend/src/config" ]; then
+        tar -czf "$BACKUP_PATH/config/frontend_config.tar.gz" -C frontend/src config/
+        print_success "Frontend configuration backed up"
+    fi
+    
+    # Backup Docker configuration
+    if [ -f "docker-compose.yml" ]; then
+        cp docker-compose.yml "$BACKUP_PATH/config/"
+        print_success "Docker Compose configuration backed up"
+    fi
+    
+    # Backup Nginx configuration
+    if [ -d "nginx" ]; then
+        tar -czf "$BACKUP_PATH/config/nginx_config.tar.gz" -C . nginx/
+        print_success "Nginx configuration backed up"
+    fi
+    
+    # Backup monitoring configuration
+    if [ -d "monitoring" ]; then
+        tar -czf "$BACKUP_PATH/config/monitoring_config.tar.gz" -C . monitoring/
+        print_success "Monitoring configuration backed up"
+    fi
+}
+
+# Archive ./logs and ./audit_logs into $BACKUP_PATH/logs; a missing directory
+# only produces a warning.
+backup_logs() {
+    print_status "Backing up log files..."
+
+    if [ ! -d "./logs" ]; then
+        print_warning "Logs directory not found, skipping"
+    else
+        tar -czf "$BACKUP_PATH/logs/application_logs.tar.gz" -C . logs/
+        print_success "Application logs backed up"
+    fi
+
+    if [ ! -d "./audit_logs" ]; then
+        print_warning "Audit logs directory not found, skipping"
+    else
+        tar -czf "$BACKUP_PATH/logs/audit_logs.tar.gz" -C . audit_logs/
+        print_success "Audit logs backed up"
+    fi
+}
+
+# Function to backup code (optional)
+backup_code() {
+    print_status "Backing up source code..."
+    
+    # Create a git archive if this is a git repository
+    if [ -d ".git" ]; then
+        git archive --format=tar.gz --output="$BACKUP_PATH/source_code.tar.gz" HEAD
+        print_success "Source code backed up (git archive)"
+    else
+        # Fallback: create a tar of the entire project
+        tar -czf "$BACKUP_PATH/source_code.tar.gz" \
+            --exclude="$BACKUP_DIR" \
+            --exclude="node_modules" \
+            --exclude="__pycache__" \
+            --exclude="*.pyc" \
+            --exclude=".git" \
+            --exclude="uploads" \
+            --exclude="output" \
+            --exclude="chroma_db" \
+            --exclude="logs" \
+            --exclude="audit_logs" \
+            .
+        print_success "Source code backed up (full project)"
+    fi
+}
+
+# Function to create backup manifest
+create_manifest() {
+    print_status "Creating backup manifest..."
+    
+    cat > "$BACKUP_PATH/backup_manifest.txt" << EOF
+AI Document Agent Backup Manifest
+=================================
+Backup Date: $(date)
+Backup Name: $BACKUP_NAME
+System: $(uname -a)
+
+Backup Contents:
+$(find "$BACKUP_PATH" -type f -name "*.tar.gz" -o -name "*.sql" -o -name "*.env" | sort)
+
+Database Information:
+- Host: $DB_HOST
+- Port: $DB_PORT
+- Database: $DB_NAME
+- User: $DB_USER
+
+Backup Size: $(du -sh "$BACKUP_PATH" | cut -f1)
+
+Restore Instructions:
+1. Extract the backup: tar -xzf $BACKUP_NAME.tar.gz
+2. Restore database: psql -h $DB_HOST -p $DB_PORT -U $DB_USER -d $DB_NAME < database/full_backup.sql
+3. Restore files: tar -xzf files/*.tar.gz
+4. Restore configuration: tar -xzf config/*.tar.gz
+5. Restart services: docker-compose up -d
+
+EOF
+    
+    print_success "Backup manifest created"
+}
+
+# Function to compress backup
+compress_backup() {
+    print_status "Compressing backup..."
+    
+    cd "$BACKUP_DIR"
+    tar -czf "${BACKUP_NAME}.tar.gz" "$BACKUP_NAME"
+    cd - > /dev/null
+    
+    # Remove uncompressed directory
+    rm -rf "$BACKUP_PATH"
+    
+    print_success "Backup compressed: $BACKUP_DIR/${BACKUP_NAME}.tar.gz"
+}
+
+# Check that the compressed backup exists and that tar can list its contents.
+# Sets the global BACKUP_FILE. Returns 1 when the archive is missing or
+# unreadable.
+verify_backup() {
+    print_status "Verifying backup..."
+
+    BACKUP_FILE="$BACKUP_DIR/${BACKUP_NAME}.tar.gz"
+
+    if [ ! -f "$BACKUP_FILE" ]; then
+        print_error "Backup file not found"
+        return 1
+    fi
+
+    # Listing the archive is enough to detect a corrupted tar file
+    if ! tar -tzf "$BACKUP_FILE" > /dev/null 2>&1; then
+        print_error "Backup verification failed - tar file is corrupted"
+        return 1
+    fi
+
+    print_success "Backup verification successful"
+    print_status "Backup size: $(du -sh "$BACKUP_FILE" | cut -f1)"
+}
+
+# Function to cleanup old backups
+cleanup_old_backups() {
+    print_status "Cleaning up old backups..."
+    
+    # Keep backups for 30 days
+    find "$BACKUP_DIR" -name "ai_document_agent_backup_*.tar.gz" -mtime +30 -delete
+    
+    print_success "Old backups cleaned up"
+}
+
+# Function to show backup status
+show_backup_status() {
+    print_status "Backup Status:"
+    echo "Backup Directory: $BACKUP_DIR"
+    echo "Latest Backup: $BACKUP_NAME"
+    echo "Backup Size: $(du -sh "$BACKUP_DIR" 2>/dev/null | cut -f1 || echo 'N/A')"
+    echo ""
+    echo "Recent Backups:"
+    ls -la "$BACKUP_DIR"/*.tar.gz 2>/dev/null | tail -5 || echo "No backups found"
+}
+
+# Function to restore backup
+restore_backup() {
+    local backup_file="$1"
+    
+    if [ -z "$backup_file" ]; then
+        print_error "No backup file specified"
+        echo "Usage: $0 restore <backup_file.tar.gz>"
+        return 1
+    fi
+    
+    if [ ! -f "$backup_file" ]; then
+        print_error "Backup file not found: $backup_file"
+        return 1
+    fi
+    
+    print_status "Restoring backup: $backup_file"
+    
+    # Extract backup
+    local temp_dir=$(mktemp -d)
+    tar -xzf "$backup_file" -C "$temp_dir"
+    
+    # Restore database
+    if [ -f "$temp_dir"/*/database/full_backup.sql ]; then
+        print_status "Restoring database..."
+        if [ -z "$DB_PASSWORD" ]; then
+            echo -n "Enter database password: "
+            read -s DB_PASSWORD
+            echo
+        fi
+        export PGPASSWORD="$DB_PASSWORD"
+        psql -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$DB_NAME" < "$temp_dir"/*/database/full_backup.sql
+        print_success "Database restored"
+    fi
+    
+    # Restore files
+    if [ -f "$temp_dir"/*/files/uploads.tar.gz ]; then
+        print_status "Restoring uploaded files..."
+        tar -xzf "$temp_dir"/*/files/uploads.tar.gz -C .
+        print_success "Uploaded files restored"
+    fi
+    
+    if [ -f "$temp_dir"/*/files/output.tar.gz ]; then
+        print_status "Restoring output files..."
+        tar -xzf "$temp_dir"/*/files/output.tar.gz -C .
+        print_success "Output files restored"
+    fi
+    
+    if [ -f "$temp_dir"/*/files/chroma_db.tar.gz ]; then
+        print_status "Restoring ChromaDB..."
+        tar -xzf "$temp_dir"/*/files/chroma_db.tar.gz -C .
+        print_success "ChromaDB restored"
+    fi
+    
+    # Cleanup
+    rm -rf "$temp_dir"
+    
+    print_success "Backup restoration completed"
+}
+
+# Print usage information: available commands, examples and the environment
+# variables the script reads.
+show_help() {
+    # Unquoted heredoc so $0 expands to the name the script was invoked with.
+    cat << EOF
+AI Document Agent Backup Script
+
+Usage: $0 [OPTION]
+
+Options:
+  backup     - Create a full backup (default)
+  restore    - Restore from backup
+  status     - Show backup status
+  cleanup    - Clean up old backups
+  help       - Show this help message
+
+Examples:
+  $0 backup                    # Create backup
+  $0 restore backup.tar.gz     # Restore from backup
+  $0 status                    # Show backup status
+  $0 cleanup                   # Clean old backups
+
+Environment Variables:
+  DB_HOST     - Database host (default: localhost)
+  DB_PORT     - Database port (default: 5432)
+  DB_NAME     - Database name (default: ai_document_agent)
+  DB_USER     - Database user (default: postgres)
+  DB_PASSWORD - Database password (will prompt if not set)
+EOF
+}
+
+# Main script logic
+main() {
+    case "${1:-backup}" in
+        "backup")
+            print_status "Starting AI Document Agent backup..."
+            
+            create_backup_directory
+            backup_database
+            backup_files
+            backup_config
+            backup_logs
+            backup_code
+            create_manifest
+            compress_backup
+            verify_backup
+            cleanup_old_backups
+            
+            print_success "Backup completed successfully!"
+            print_status "Backup location: $BACKUP_DIR/${BACKUP_NAME}.tar.gz"
+            ;;
+        "restore")
+            restore_backup "$2"
+            ;;
+        "status")
+            show_backup_status
+            ;;
+        "cleanup")
+            cleanup_old_backups
+            ;;
+        "help"|"-h"|"--help")
+            show_help
+            ;;
+        *)
+            print_error "Unknown option: $1"
+            show_help
+            exit 1
+            ;;
+    esac
+}
+
+# Run main function with all arguments
+main "$@"
diff --git a/scripts/init-db.sql b/scripts/init-db.sql
new file mode 100644
index 0000000..d6bd78d
--- /dev/null
+++ b/scripts/init-db.sql
@@ -0,0 +1,606 @@
+-- AI Document Agent Database Initialization Script
+-- This script creates the database schema and initial data
+
+-- Create database if it doesn't exist
+-- Note: This should be run as a superuser or database owner
+-- CREATE DATABASE ai_document_agent;
+
+-- Connect to the database
+-- \c ai_document_agent;
+
+-- Enable required extensions.
+-- pgcrypto supplies crypt()/gen_salt(), which the admin seed row below uses to
+-- hash the default password; "vector" supplies the vector column type and the
+-- ivfflat index method used by knowledge_base.
+-- NOTE(review): no uuid-ossp function is called anywhere in this script --
+-- confirm the application needs it before keeping the dependency.
+CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
+CREATE EXTENSION IF NOT EXISTS "pgcrypto";
+CREATE EXTENSION IF NOT EXISTS "vector";
+
+-- Create custom enum types.
+-- CREATE TYPE has no IF NOT EXISTS form, so each definition runs inside a DO
+-- block that swallows duplicate_object. This keeps the script re-runnable,
+-- matching the IF NOT EXISTS / OR REPLACE style used for every other object.
+
+-- Lifecycle state shared by documents, processing runs, agent executions and
+-- document comparisons.
+DO $$
+BEGIN
+    CREATE TYPE document_status AS ENUM ('pending', 'processing', 'completed', 'failed');
+EXCEPTION
+    WHEN duplicate_object THEN NULL;
+END
+$$;
+
+-- Kind of agent recorded in agent_executions.agent_type.
+DO $$
+BEGIN
+    CREATE TYPE agent_type AS ENUM ('orchestrator', 'ingestion', 'classifier', 'entity', 'risk', 'qa', 'compare', 'audit', 'summarizer', 'translator', 'sentiment');
+EXCEPTION
+    WHEN duplicate_object THEN NULL;
+END
+$$;
+
+-- Severity attached to audit_events rows.
+DO $$
+BEGIN
+    CREATE TYPE severity_level AS ENUM ('low', 'medium', 'high', 'critical');
+EXCEPTION
+    WHEN duplicate_object THEN NULL;
+END
+$$;
+
+-- Coarse role stored directly on users.role.
+DO $$
+BEGIN
+    CREATE TYPE user_role AS ENUM ('user', 'manager', 'admin');
+EXCEPTION
+    WHEN duplicate_object THEN NULL;
+END
+$$;
+
+-- Create users table: one row per account, keyed by a serial id, with a
+-- unique, mandatory email and a mandatory password hash.
+-- NOTE(review): a role is stored here as a user_role enum value and roles can
+-- also be linked through roles/user_roles; which of the two the application
+-- treats as authoritative is not visible in this script.
+CREATE TABLE IF NOT EXISTS users (
+    id SERIAL PRIMARY KEY,
+    email VARCHAR(255) UNIQUE NOT NULL,
+    hashed_password VARCHAR(255) NOT NULL,
+    full_name VARCHAR(255),
+    is_active BOOLEAN DEFAULT true,
+    is_superuser BOOLEAN DEFAULT false,
+    role user_role DEFAULT 'user',
+    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+    -- Refreshed on every UPDATE by the update_users_updated_at trigger below.
+    updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+    last_login TIMESTAMP WITH TIME ZONE
+);
+
+-- Create roles table: named roles with a free-form JSONB permissions payload
+-- (the seed rows below store JSON arrays of permission strings).
+CREATE TABLE IF NOT EXISTS roles (
+    id SERIAL PRIMARY KEY,
+    name VARCHAR(100) UNIQUE NOT NULL,
+    description TEXT,
+    permissions JSONB,
+    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
+);
+
+-- Create user_roles junction table: many-to-many link between users and
+-- roles. The composite primary key prevents duplicate assignments, and a link
+-- row is removed automatically when either side is deleted.
+CREATE TABLE IF NOT EXISTS user_roles (
+    user_id INTEGER REFERENCES users(id) ON DELETE CASCADE,
+    role_id INTEGER REFERENCES roles(id) ON DELETE CASCADE,
+    PRIMARY KEY (user_id, role_id)
+);
+
+-- Create documents table: one row per uploaded file, holding the file
+-- metadata, the extracted text (content), the processing state and the JSONB
+-- outputs of the analysis steps.
+-- uploaded_by is mandatory and its foreign key declares no ON DELETE action,
+-- so a user that still owns documents cannot be deleted.
+-- NOTE(review): tags are stored both in the tags JSONB column here and in the
+-- document_tags junction table; confirm which one callers read.
+CREATE TABLE IF NOT EXISTS documents (
+    id SERIAL PRIMARY KEY,
+    filename VARCHAR(255) NOT NULL,
+    original_filename VARCHAR(255) NOT NULL,
+    content TEXT,
+    file_path VARCHAR(500) NOT NULL,
+    -- Presumably bytes (the max_file_size setting below is in bytes) -- TODO confirm.
+    file_size INTEGER NOT NULL,
+    content_type VARCHAR(100) NOT NULL,
+    doc_type VARCHAR(50),
+    domain VARCHAR(100),
+    processing_status document_status DEFAULT 'pending',
+    processing_result JSONB,
+    confidence_score FLOAT,
+    risk_score FLOAT,
+    metadata JSONB,
+    tags JSONB,
+    entities JSONB,
+    clauses JSONB,
+    risks JSONB,
+    qa_pairs JSONB,
+    uploaded_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+    processed_at TIMESTAMP WITH TIME ZONE,
+    -- Refreshed on every UPDATE by the update_documents_updated_at trigger below.
+    updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+    uploaded_by INTEGER REFERENCES users(id) NOT NULL
+);
+
+-- Create tags table: uniquely named labels. color is 7 characters wide, which
+-- fits the "#RRGGBB" values used by the seed rows below.
+CREATE TABLE IF NOT EXISTS tags (
+    id SERIAL PRIMARY KEY,
+    name VARCHAR(100) UNIQUE NOT NULL,
+    description TEXT,
+    color VARCHAR(7),
+    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
+);
+
+-- Create document_tags junction table: many-to-many link between documents
+-- and tags. Rows disappear with either parent (ON DELETE CASCADE), and the
+-- composite primary key is what lets auto_assign_tags() below insert with
+-- ON CONFLICT DO NOTHING.
+CREATE TABLE IF NOT EXISTS document_tags (
+    document_id INTEGER REFERENCES documents(id) ON DELETE CASCADE,
+    tag_id INTEGER REFERENCES tags(id) ON DELETE CASCADE,
+    PRIMARY KEY (document_id, tag_id)
+);
+
+-- Create compliance_frameworks table: uniquely named frameworks with an
+-- optional version string and a JSONB requirements payload (left NULL by the
+-- seed rows below).
+CREATE TABLE IF NOT EXISTS compliance_frameworks (
+    id SERIAL PRIMARY KEY,
+    name VARCHAR(100) UNIQUE NOT NULL,
+    description TEXT,
+    version VARCHAR(20),
+    requirements JSONB,
+    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
+);
+
+-- Create document_compliance_frameworks junction table: many-to-many link
+-- between documents and compliance frameworks; link rows are removed when
+-- either parent row is deleted.
+CREATE TABLE IF NOT EXISTS document_compliance_frameworks (
+    document_id INTEGER REFERENCES documents(id) ON DELETE CASCADE,
+    framework_id INTEGER REFERENCES compliance_frameworks(id) ON DELETE CASCADE,
+    PRIMARY KEY (document_id, framework_id)
+);
+
+-- Create processing_history table: one row per processing run of a document,
+-- identified externally by the unique processing_id. Nothing limits a
+-- document to a single run, so a document may have several rows here.
+-- Both foreign keys are mandatory and declare no ON DELETE action, so the
+-- referenced document and user cannot be deleted while a run references them.
+CREATE TABLE IF NOT EXISTS processing_history (
+    id SERIAL PRIMARY KEY,
+    processing_id VARCHAR(100) UNIQUE NOT NULL,
+    workflow_id VARCHAR(100),
+    current_stage VARCHAR(100),
+    completed_stages JSONB,
+    failed_stages JSONB,
+    -- Unit is not stated in this script -- presumably seconds; TODO confirm.
+    total_execution_time FLOAT,
+    progress_percentage FLOAT DEFAULT 0.0,
+    status document_status DEFAULT 'pending',
+    result JSONB,
+    confidence FLOAT,
+    rationale TEXT,
+    error_message TEXT,
+    started_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+    completed_at TIMESTAMP WITH TIME ZONE,
+    document_id INTEGER REFERENCES documents(id) NOT NULL,
+    user_id INTEGER REFERENCES users(id) NOT NULL
+);
+
+-- Create agent_executions table: one row per agent invocation, always tied to
+-- a processing_history row. Stores the agent kind and name, size and resource
+-- figures, the outcome and the JSONB output.
+-- get_agent_performance_metrics() below aggregates these rows per agent_type.
+CREATE TABLE IF NOT EXISTS agent_executions (
+    id SERIAL PRIMARY KEY,
+    execution_id VARCHAR(100) UNIQUE NOT NULL,
+    agent_type agent_type NOT NULL,
+    agent_name VARCHAR(100) NOT NULL,
+    input_size INTEGER,
+    output_size INTEGER,
+    -- Units of the three measurements below are not stated here -- TODO confirm.
+    execution_time FLOAT,
+    memory_usage FLOAT,
+    cpu_usage FLOAT,
+    status document_status DEFAULT 'pending',
+    confidence FLOAT,
+    output JSONB,
+    error_message TEXT,
+    started_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+    completed_at TIMESTAMP WITH TIME ZONE,
+    processing_history_id INTEGER REFERENCES processing_history(id) NOT NULL
+);
+
+-- Create document_comparisons table: result of comparing two documents
+-- (document_a_id and document_b_id), requested by created_by.
+-- NOTE(review): no constraint stops document_a_id and document_b_id from
+-- pointing at the same document -- confirm whether that should be allowed.
+CREATE TABLE IF NOT EXISTS document_comparisons (
+    id SERIAL PRIMARY KEY,
+    comparison_id VARCHAR(100) UNIQUE NOT NULL,
+    comparison_type VARCHAR(50) NOT NULL,
+    similarity_score FLOAT,
+    differences JSONB,
+    risk_changes JSONB,
+    status document_status DEFAULT 'pending',
+    result JSONB,
+    confidence FLOAT,
+    summary TEXT,
+    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+    completed_at TIMESTAMP WITH TIME ZONE,
+    document_a_id INTEGER REFERENCES documents(id) NOT NULL,
+    document_b_id INTEGER REFERENCES documents(id) NOT NULL,
+    created_by INTEGER REFERENCES users(id) NOT NULL
+);
+
+-- Create audit_events table: audit trail entries with a type, category,
+-- severity and free-form JSONB details. user_id and document_id are optional,
+-- so an event does not have to be tied to a user or a document.
+-- clean_old_audit_logs() below deletes rows by their timestamp column.
+CREATE TABLE IF NOT EXISTS audit_events (
+    id SERIAL PRIMARY KEY,
+    event_id VARCHAR(100) UNIQUE NOT NULL,
+    event_type VARCHAR(100) NOT NULL,
+    event_category VARCHAR(50) NOT NULL,
+    severity severity_level DEFAULT 'low',
+    description TEXT NOT NULL,
+    details JSONB,
+    ip_address INET,
+    user_agent TEXT,
+    timestamp TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+    user_id INTEGER REFERENCES users(id),
+    document_id INTEGER REFERENCES documents(id)
+);
+
+-- Create system_metrics table: individual metric samples, each with a name,
+-- a type, a numeric value, optional JSONB labels and the time it was taken.
+-- metric_id is unique per row, i.e. per sample rather than per metric name.
+CREATE TABLE IF NOT EXISTS system_metrics (
+    id SERIAL PRIMARY KEY,
+    metric_id VARCHAR(100) UNIQUE NOT NULL,
+    metric_name VARCHAR(100) NOT NULL,
+    metric_type VARCHAR(50) NOT NULL,
+    value FLOAT NOT NULL,
+    labels JSONB,
+    timestamp TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+    description TEXT,
+    unit VARCHAR(20)
+);
+
+-- Create workflow_templates table: named, versioned processing workflows.
+-- stages is mandatory JSONB; the sample row inserted later in this script
+-- stores an array of {name, agent, order, required} objects there and a
+-- per-agent {timeout, max_retries} map in agent_config.
+CREATE TABLE IF NOT EXISTS workflow_templates (
+    id SERIAL PRIMARY KEY,
+    template_id VARCHAR(100) UNIQUE NOT NULL,
+    name VARCHAR(100) NOT NULL,
+    description TEXT,
+    version VARCHAR(20) DEFAULT '1.0.0',
+    stages JSONB NOT NULL,
+    agent_config JSONB,
+    workflow_config JSONB,
+    is_active BOOLEAN DEFAULT true,
+    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+    -- Refreshed on every UPDATE by the update_workflow_templates_updated_at trigger below.
+    updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+    created_by INTEGER REFERENCES users(id) NOT NULL
+);
+
+-- Create knowledge_base table: reference content entries with an optional
+-- 1536-dimension embedding. Requires the "vector" extension enabled at the
+-- top of this script; the embedding column gets an ivfflat cosine index in
+-- the index section below.
+CREATE TABLE IF NOT EXISTS knowledge_base (
+    id SERIAL PRIMARY KEY,
+    kb_id VARCHAR(100) UNIQUE NOT NULL,
+    name VARCHAR(100) NOT NULL,
+    description TEXT,
+    domain VARCHAR(100),
+    content TEXT NOT NULL,
+    content_type VARCHAR(50) NOT NULL,
+    vector_embedding vector(1536), -- OpenAI embedding dimension
+    source VARCHAR(255),
+    version VARCHAR(20) DEFAULT '1.0.0',
+    is_active BOOLEAN DEFAULT true,
+    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+    -- Refreshed on every UPDATE by the update_knowledge_base_updated_at trigger below.
+    updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+    created_by INTEGER REFERENCES users(id) NOT NULL
+);
+
+-- Create notifications table: messages addressed to one user (mandatory
+-- user_id), optionally about one document. is_read and is_sent are tracked
+-- separately, and sent_at stays NULL unless a writer sets it.
+CREATE TABLE IF NOT EXISTS notifications (
+    id SERIAL PRIMARY KEY,
+    notification_id VARCHAR(100) UNIQUE NOT NULL,
+    title VARCHAR(255) NOT NULL,
+    message TEXT NOT NULL,
+    notification_type VARCHAR(50) NOT NULL,
+    priority VARCHAR(20) DEFAULT 'normal',
+    is_read BOOLEAN DEFAULT false,
+    is_sent BOOLEAN DEFAULT false,
+    sent_at TIMESTAMP WITH TIME ZONE,
+    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+    user_id INTEGER REFERENCES users(id) NOT NULL,
+    document_id INTEGER REFERENCES documents(id)
+);
+
+-- Create api_logs table: one row per logged API request, with method,
+-- endpoint, status, sizes and timing, plus the raw headers and bodies.
+-- user_id is optional.
+-- NOTE(review): request_headers, request_body and response_body are stored
+-- verbatim by this schema. The enable_pii_redaction setting seeded below
+-- exists, but whether it is applied before rows are written here is not
+-- visible in this script -- confirm that credentials and PII are not
+-- persisted.
+CREATE TABLE IF NOT EXISTS api_logs (
+    id SERIAL PRIMARY KEY,
+    method VARCHAR(10) NOT NULL,
+    endpoint VARCHAR(255) NOT NULL,
+    status_code INTEGER NOT NULL,
+    response_time FLOAT,
+    request_size INTEGER,
+    response_size INTEGER,
+    ip_address INET,
+    user_agent TEXT,
+    request_headers JSONB,
+    request_body TEXT,
+    response_body TEXT,
+    timestamp TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+    user_id INTEGER REFERENCES users(id)
+);
+
+-- Create system_configurations table: key/value settings. Every value is
+-- stored as text in config_value, with config_type naming how to interpret
+-- it (the seed rows below use 'integer', 'json', 'float' and 'boolean').
+-- clean_old_audit_logs() reads 'audit_retention_days' from here.
+CREATE TABLE IF NOT EXISTS system_configurations (
+    id SERIAL PRIMARY KEY,
+    config_key VARCHAR(100) UNIQUE NOT NULL,
+    config_value TEXT NOT NULL,
+    config_type VARCHAR(50) NOT NULL,
+    description TEXT,
+    is_active BOOLEAN DEFAULT true,
+    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+    -- Refreshed on every UPDATE by the update_system_configurations_updated_at trigger below.
+    updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
+    updated_by INTEGER REFERENCES users(id)
+);
+
+-- Create indexes for better performance.
+-- These cover the foreign-key, status and timestamp columns named in each
+-- index. All use IF NOT EXISTS, so re-running the script is harmless.
+CREATE INDEX IF NOT EXISTS idx_documents_uploaded_by ON documents(uploaded_by);
+CREATE INDEX IF NOT EXISTS idx_documents_status ON documents(processing_status);
+CREATE INDEX IF NOT EXISTS idx_documents_uploaded_at ON documents(uploaded_at);
+CREATE INDEX IF NOT EXISTS idx_processing_history_document_id ON processing_history(document_id);
+CREATE INDEX IF NOT EXISTS idx_agent_executions_processing_history_id ON agent_executions(processing_history_id);
+CREATE INDEX IF NOT EXISTS idx_audit_events_timestamp ON audit_events(timestamp);
+CREATE INDEX IF NOT EXISTS idx_audit_events_user_id ON audit_events(user_id);
+CREATE INDEX IF NOT EXISTS idx_system_metrics_timestamp ON system_metrics(timestamp);
+-- Approximate nearest-neighbour index for cosine distance on the embedding.
+-- NOTE(review): this script inserts no knowledge_base rows, so on a fresh
+-- database the ivfflat index is built over an empty table. pgvector's
+-- guidance is to build ivfflat once representative data is loaded -- consider
+-- rebuilding it after the initial import.
+CREATE INDEX IF NOT EXISTS idx_knowledge_base_vector ON knowledge_base USING ivfflat (vector_embedding vector_cosine_ops);
+CREATE INDEX IF NOT EXISTS idx_api_logs_timestamp ON api_logs(timestamp);
+CREATE INDEX IF NOT EXISTS idx_api_logs_endpoint ON api_logs(endpoint);
+
+-- Insert default roles.
+-- permissions is a JSON array of permission strings; the admin role uses the
+-- single entry "*". ON CONFLICT (name) DO NOTHING leaves existing roles
+-- untouched when the script is re-run.
+INSERT INTO roles (name, description, permissions) VALUES
+('admin', 'Administrator with full access', '["*"]'),
+('manager', 'Manager with limited admin access', '["read", "write", "analyze", "audit"]'),
+('user', 'Regular user with basic access', '["read", "write", "analyze"]')
+ON CONFLICT (name) DO NOTHING;
+
+-- Insert default admin user (password: admin123).
+-- The password is hashed with pgcrypto's crypt() using a bcrypt ('bf') salt.
+-- The row is only created when no user with this email exists, and it is
+-- referenced later in this script as the owner of the sample workflow
+-- template.
+-- SECURITY NOTE: this is a publicly known default credential for an active
+-- superuser account. Change the password (or remove the account) before
+-- exposing the system outside local development.
+INSERT INTO users (email, hashed_password, full_name, is_active, is_superuser, role) VALUES
+('admin@example.com', crypt('admin123', gen_salt('bf')), 'System Administrator', true, true, 'admin')
+ON CONFLICT (email) DO NOTHING;
+
+-- Insert default compliance frameworks.
+-- Only name, description and version are seeded; requirements stays NULL.
+INSERT INTO compliance_frameworks (name, description, version) VALUES
+('GDPR', 'General Data Protection Regulation', '2018'),
+('HIPAA', 'Health Insurance Portability and Accountability Act', '1996'),
+('SOX', 'Sarbanes-Oxley Act', '2002'),
+('PCI-DSS', 'Payment Card Industry Data Security Standard', '4.0')
+ON CONFLICT (name) DO NOTHING;
+
+-- Insert default tags.
+-- auto_assign_tags() below looks these rows up by name, so renaming one
+-- silently disables the matching auto-tagging rule.
+INSERT INTO tags (name, description, color) VALUES
+('contract', 'Legal contracts', '#2196F3'),
+('invoice', 'Financial invoices', '#4CAF50'),
+('policy', 'Company policies', '#FF9800'),
+('report', 'Business reports', '#9C27B0'),
+('compliance', 'Compliance documents', '#F44336')
+ON CONFLICT (name) DO NOTHING;
+
+-- Insert default system configurations.
+-- Values are stored as text; config_type says how to parse them.
+-- max_file_size 52428800 bytes = 50 MiB. audit_retention_days is the value
+-- clean_old_audit_logs() reads.
+INSERT INTO system_configurations (config_key, config_value, config_type, description) VALUES
+('max_file_size', '52428800', 'integer', 'Maximum file upload size in bytes'),
+('allowed_file_types', '["pdf", "docx", "txt", "csv", "xlsx"]', 'json', 'Allowed file types for upload'),
+('agent_timeout', '300', 'integer', 'Agent execution timeout in seconds'),
+('confidence_threshold', '0.7', 'float', 'Minimum confidence threshold for agent results'),
+('audit_retention_days', '90', 'integer', 'Number of days to retain audit logs'),
+('enable_monitoring', 'true', 'boolean', 'Enable system monitoring'),
+('enable_pii_redaction', 'true', 'boolean', 'Enable PII redaction in logs')
+ON CONFLICT (config_key) DO NOTHING;
+
+-- Trigger function: stamp the row being written with the current timestamp.
+-- Meant to be attached as a BEFORE UPDATE ... FOR EACH ROW trigger on any
+-- table that has an updated_at column; it returns the modified NEW row.
+CREATE OR REPLACE FUNCTION update_updated_at_column()
+RETURNS TRIGGER
+LANGUAGE plpgsql
+AS $body$
+BEGIN
+    NEW.updated_at := CURRENT_TIMESTAMP;
+    RETURN NEW;
+END;
+$body$;
+
+-- Create triggers for updated_at columns.
+-- Each trigger runs update_updated_at_column() before every row update so
+-- that updated_at always reflects the last modification.
+-- CREATE TRIGGER has no IF NOT EXISTS form, so each trigger is dropped first;
+-- without that, re-running this otherwise idempotent script would abort with
+-- "trigger ... already exists".
+DROP TRIGGER IF EXISTS update_users_updated_at ON users;
+CREATE TRIGGER update_users_updated_at BEFORE UPDATE ON users FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();
+
+DROP TRIGGER IF EXISTS update_documents_updated_at ON documents;
+CREATE TRIGGER update_documents_updated_at BEFORE UPDATE ON documents FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();
+
+DROP TRIGGER IF EXISTS update_workflow_templates_updated_at ON workflow_templates;
+CREATE TRIGGER update_workflow_templates_updated_at BEFORE UPDATE ON workflow_templates FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();
+
+DROP TRIGGER IF EXISTS update_knowledge_base_updated_at ON knowledge_base;
+CREATE TRIGGER update_knowledge_base_updated_at BEFORE UPDATE ON knowledge_base FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();
+
+DROP TRIGGER IF EXISTS update_system_configurations_updated_at ON system_configurations;
+CREATE TRIGGER update_system_configurations_updated_at BEFORE UPDATE ON system_configurations FOR EACH ROW EXECUTE FUNCTION update_updated_at_column();
+
+-- Grant permissions to application user (adjust username as needed)
+-- GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO ai_document_agent_user;
+-- GRANT ALL PRIVILEGES ON ALL SEQUENCES IN SCHEMA public TO ai_document_agent_user;
+
+-- View document_summary: one row per document with its headline fields, the
+-- uploader's full name, and how many distinct tags and compliance frameworks
+-- are linked to it. The two junction tables are joined side by side, so
+-- COUNT(DISTINCT ...) is what keeps each count correct.
+CREATE OR REPLACE VIEW document_summary AS
+SELECT
+    doc.id,
+    doc.filename,
+    doc.doc_type,
+    doc.processing_status,
+    doc.confidence_score,
+    doc.risk_score,
+    doc.uploaded_at,
+    uploader.full_name AS uploaded_by_name,
+    COUNT(DISTINCT tag_link.tag_id) AS tag_count,
+    COUNT(DISTINCT fw_link.framework_id) AS compliance_framework_count
+FROM documents AS doc
+LEFT JOIN users AS uploader
+       ON uploader.id = doc.uploaded_by
+LEFT JOIN document_tags AS tag_link
+       ON tag_link.document_id = doc.id
+LEFT JOIN document_compliance_frameworks AS fw_link
+       ON fw_link.document_id = doc.id
+GROUP BY
+    doc.id,
+    doc.filename,
+    doc.doc_type,
+    doc.processing_status,
+    doc.confidence_score,
+    doc.risk_score,
+    doc.uploaded_at,
+    uploader.full_name;
+
+-- View system_health: two summary rows, one for documents and one for users.
+-- The column names come from the documents branch. In the users row,
+-- completed_count holds the number of active users, failed_count the number
+-- of inactive users, and avg_confidence is NULL.
+CREATE OR REPLACE VIEW system_health AS
+SELECT
+    'documents' AS component,
+    COUNT(*) AS total_count,
+    COUNT(*) FILTER (WHERE processing_status = 'completed') AS completed_count,
+    COUNT(*) FILTER (WHERE processing_status = 'failed') AS failed_count,
+    AVG(confidence_score) AS avg_confidence
+FROM documents
+UNION ALL
+SELECT
+    'users' AS component,
+    COUNT(*) AS total_count,
+    COUNT(*) FILTER (WHERE is_active = true) AS completed_count,
+    COUNT(*) FILTER (WHERE is_active = false) AS failed_count,
+    NULL AS avg_confidence
+FROM users;
+
+-- clean_old_audit_logs(): delete audit_events rows older than the configured
+-- retention period and return how many rows were removed.
+-- The period is read from system_configurations ('audit_retention_days') and
+-- falls back to 90 days when that key is absent. A value that cannot be cast
+-- to INTEGER raises an error.
+CREATE OR REPLACE FUNCTION clean_old_audit_logs()
+RETURNS INTEGER
+LANGUAGE plpgsql
+AS $$
+DECLARE
+    retention_days INTEGER;
+    deleted_count INTEGER;
+BEGIN
+    -- config_key is UNIQUE, so the subquery yields at most one row; no row
+    -- yields NULL and the COALESCE supplies the 90-day default.
+    retention_days := COALESCE(
+        (SELECT cfg.config_value::INTEGER
+           FROM system_configurations AS cfg
+          WHERE cfg.config_key = 'audit_retention_days'),
+        90
+    );
+
+    DELETE FROM audit_events AS ev
+    WHERE ev.timestamp < CURRENT_TIMESTAMP - INTERVAL '1 day' * retention_days;
+
+    -- ROW_COUNT reports the rows affected by the DELETE just executed.
+    GET DIAGNOSTICS deleted_count = ROW_COUNT;
+    RETURN deleted_count;
+END;
+$$;
+
+-- get_document_statistics(start_date, end_date): aggregate figures for the
+-- documents uploaded within [start_date, end_date] (both bounds inclusive;
+-- defaults: the last 30 days).
+--
+-- Returns a single row:
+--   total_documents      number of documents in the window
+--   completed_documents  those with processing_status = 'completed'
+--   failed_documents     those with processing_status = 'failed'
+--   avg_processing_time  mean total_execution_time over all processing_history
+--                        rows belonging to those documents (NULL if none)
+--   avg_confidence_score mean documents.confidence_score (NULL if none)
+--   total_file_size      sum of documents.file_size (NULL if no documents)
+--
+-- The document figures are computed from documents alone. Joining
+-- processing_history directly would repeat a document once per processing
+-- run and inflate the counts, the file-size total and the confidence average
+-- for any document processed more than once.
+CREATE OR REPLACE FUNCTION get_document_statistics(
+    start_date TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP - INTERVAL '30 days',
+    end_date TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
+)
+RETURNS TABLE (
+    total_documents BIGINT,
+    completed_documents BIGINT,
+    failed_documents BIGINT,
+    avg_processing_time FLOAT,
+    avg_confidence_score FLOAT,
+    total_file_size BIGINT
+) AS $$
+BEGIN
+    -- Every column reference is table-qualified and no output alias is used,
+    -- so nothing can be confused with the OUT parameters declared above.
+    RETURN QUERY
+    WITH docs AS (
+        SELECT d.id, d.processing_status, d.confidence_score, d.file_size
+        FROM documents d
+        WHERE d.uploaded_at BETWEEN start_date AND end_date
+    )
+    SELECT
+        COUNT(*)::BIGINT,
+        (COUNT(*) FILTER (WHERE docs.processing_status = 'completed'))::BIGINT,
+        (COUNT(*) FILTER (WHERE docs.processing_status = 'failed'))::BIGINT,
+        -- Averaged per processing run, exactly as before, but computed
+        -- separately so it cannot multiply the document rows.
+        (SELECT AVG(ph.total_execution_time)
+           FROM processing_history ph
+          WHERE ph.document_id IN (SELECT docs_inner.id FROM docs AS docs_inner)),
+        AVG(docs.confidence_score),
+        SUM(docs.file_size)::BIGINT
+    FROM docs;
+END;
+$$ LANGUAGE plpgsql;
+
+-- search_documents_by_similarity(search_text, similarity_threshold, limit_count):
+-- case-insensitive substring search over the content of completed documents.
+--
+-- Returns up to limit_count rows (document_id, filename, similarity_score,
+-- content_preview), highest confidence first. similarity_score is the
+-- document's stored confidence_score, not a measure of how well it matches
+-- search_text. content_preview is the first 200 characters of the content.
+-- A NULL search_text matches nothing.
+--
+-- search_text is matched literally: %, _ and \ are escaped so that user
+-- input cannot act as LIKE wildcards. Documents without a confidence score
+-- sort after scored ones instead of ahead of them.
+--
+-- NOTE(review): similarity_threshold is accepted for interface compatibility
+-- but is not used by this implementation, and no vector similarity is
+-- computed despite the function name.
+CREATE OR REPLACE FUNCTION search_documents_by_similarity(
+    search_text TEXT,
+    similarity_threshold FLOAT DEFAULT 0.8,
+    limit_count INTEGER DEFAULT 10
+)
+RETURNS TABLE (
+    document_id INTEGER,
+    filename VARCHAR(255),
+    similarity_score FLOAT,
+    content_preview TEXT
+) AS $$
+DECLARE
+    like_pattern TEXT;
+BEGIN
+    -- Escape the LIKE metacharacters (backslash first, so the escapes added
+    -- for % and _ are not themselves doubled), then wrap in wildcards.
+    like_pattern := '%'
+        || replace(replace(replace(search_text, '\', '\\'), '%', '\%'), '_', '\_')
+        || '%';
+
+    RETURN QUERY
+    SELECT
+        d.id,
+        d.filename,
+        d.confidence_score,
+        LEFT(d.content, 200)
+    FROM documents d
+    WHERE d.processing_status = 'completed'
+    AND d.content ILIKE like_pattern
+    ORDER BY d.confidence_score DESC NULLS LAST
+    LIMIT limit_count;
+END;
+$$ LANGUAGE plpgsql;
+
+-- get_agent_performance_metrics(start_date, end_date): per-agent-type summary
+-- of the agent_executions rows whose started_at lies within
+-- [start_date, end_date] (both bounds inclusive; defaults: the last 7 days).
+--
+-- Returns one row per agent type, busiest first: the number of executions,
+-- how many of them have status 'completed', and the mean execution time,
+-- confidence and memory usage.
+CREATE OR REPLACE FUNCTION get_agent_performance_metrics(
+    start_date TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP - INTERVAL '7 days',
+    end_date TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
+)
+RETURNS TABLE (
+    agent_type TEXT,
+    total_executions BIGINT,
+    successful_executions BIGINT,
+    avg_execution_time FLOAT,
+    avg_confidence FLOAT,
+    avg_memory_usage FLOAT
+)
+LANGUAGE plpgsql
+AS $$
+BEGIN
+    RETURN QUERY
+    SELECT
+        runs.agent_type::TEXT,
+        COUNT(*)::BIGINT AS total_executions,
+        (COUNT(*) FILTER (WHERE runs.status = 'completed'))::BIGINT AS successful_executions,
+        AVG(runs.execution_time) AS avg_execution_time,
+        AVG(runs.confidence) AS avg_confidence,
+        AVG(runs.memory_usage) AS avg_memory_usage
+    FROM agent_executions AS runs
+    WHERE runs.started_at >= start_date
+      AND runs.started_at <= end_date
+    GROUP BY runs.agent_type
+    ORDER BY total_executions DESC;
+END;
+$$;
+
+-- Create indexes for the new functions.
+-- The started_at indexes support the date-range filter in
+-- get_agent_performance_metrics() and time-based queries on processing runs.
+-- NOTE(review): the GIN index is built on to_tsvector('english', content),
+-- but search_documents_by_similarity() in this script filters with ILIKE,
+-- which cannot use an expression index on a tsvector. Either the search
+-- should use a full-text query against this expression, or a trigram index
+-- would fit the ILIKE search better -- confirm which is intended.
+CREATE INDEX IF NOT EXISTS idx_documents_content_gin ON documents USING gin(to_tsvector('english', content));
+CREATE INDEX IF NOT EXISTS idx_agent_executions_started_at ON agent_executions(started_at);
+CREATE INDEX IF NOT EXISTS idx_processing_history_started_at ON processing_history(started_at);
+
+-- Insert sample workflow template.
+-- stages is an ordered list of steps; each names the agent that runs it and
+-- whether the step is required. agent_config gives a timeout and retry limit
+-- per agent (units are not stated here -- presumably seconds; TODO confirm).
+-- created_by is resolved from the seeded admin account. If that account does
+-- not exist the subquery yields NULL and the NOT NULL constraint on
+-- created_by rejects the row.
+INSERT INTO workflow_templates (template_id, name, description, stages, agent_config, created_by) VALUES
+('default-workflow', 'Default Document Processing Workflow', 'Standard workflow for processing documents', 
+'[
+    {"name": "ingestion", "agent": "ingestion", "order": 1, "required": true},
+    {"name": "classification", "agent": "classifier", "order": 2, "required": true},
+    {"name": "entity_extraction", "agent": "entity", "order": 3, "required": true},
+    {"name": "risk_assessment", "agent": "risk", "order": 4, "required": true},
+    {"name": "summarization", "agent": "summarizer", "order": 5, "required": false},
+    {"name": "audit_logging", "agent": "audit", "order": 6, "required": true}
+]',
+'{
+    "ingestion": {"timeout": 60, "max_retries": 3},
+    "classifier": {"timeout": 30, "max_retries": 2},
+    "entity": {"timeout": 45, "max_retries": 2},
+    "risk": {"timeout": 60, "max_retries": 3},
+    "summarizer": {"timeout": 90, "max_retries": 2},
+    "audit": {"timeout": 10, "max_retries": 1}
+}',
+(SELECT id FROM users WHERE email = 'admin@example.com' LIMIT 1))
+ON CONFLICT (template_id) DO NOTHING;
+
+-- auto_assign_tags(document_id): attach default tags to a document based on
+-- keywords found in its content.
+--
+-- Each rule pairs a tag name with a set of case-insensitive substring
+-- patterns. When the content matches any pattern of a rule and a tag with
+-- that name exists, the (document, tag) link is inserted. Links that already
+-- exist are left alone. Does nothing when the document is missing or has no
+-- content.
+CREATE OR REPLACE FUNCTION auto_assign_tags(document_id INTEGER)
+RETURNS VOID
+LANGUAGE plpgsql
+AS $$
+DECLARE
+    doc_content TEXT;
+    tag_rule RECORD;
+BEGIN
+    SELECT content INTO doc_content FROM documents WHERE id = document_id;
+
+    IF doc_content IS NULL THEN
+        RETURN;
+    END IF;
+
+    FOR tag_rule IN
+        SELECT r.tag_name, r.patterns
+        FROM (VALUES
+            ('contract',   ARRAY['%contract%', '%agreement%']),
+            ('invoice',    ARRAY['%invoice%', '%bill%', '%payment%']),
+            ('policy',     ARRAY['%policy%', '%procedure%', '%guideline%']),
+            ('report',     ARRAY['%report%', '%analysis%', '%summary%']),
+            ('compliance', ARRAY['%gdpr%', '%hipaa%', '%sox%', '%compliance%'])
+        ) AS r(tag_name, patterns)
+    LOOP
+        IF doc_content ILIKE ANY (tag_rule.patterns) THEN
+            -- tags.name is UNIQUE, so this inserts at most one link, and
+            -- none at all when the tag row is absent. The parameter is
+            -- function-qualified to keep it distinct from the target column
+            -- of the same name.
+            INSERT INTO document_tags (document_id, tag_id)
+            SELECT auto_assign_tags.document_id, t.id
+            FROM tags AS t
+            WHERE t.name = tag_rule.tag_name
+            ON CONFLICT DO NOTHING;
+        END IF;
+    END LOOP;
+END;
+$$;
+
+-- Trigger function: run auto_assign_tags() when a document's
+-- processing_status changes to 'completed'.
+-- The previous status is compared with IS DISTINCT FROM because
+-- processing_status is nullable: with a plain != a NULL previous status makes
+-- the condition NULL, and the document would never be tagged.
+-- Updates to a document that is already 'completed' do not re-run tagging.
+CREATE OR REPLACE FUNCTION trigger_auto_assign_tags()
+RETURNS TRIGGER AS $$
+BEGIN
+    IF NEW.processing_status = 'completed'
+       AND OLD.processing_status IS DISTINCT FROM 'completed' THEN
+        PERFORM auto_assign_tags(NEW.id);
+    END IF;
+    RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+
+-- Attach the auto-tagging function to documents. It fires after every row
+-- update; the function itself decides whether tagging is due.
+-- The trigger is dropped first because CREATE TRIGGER has no IF NOT EXISTS
+-- form and would otherwise abort a re-run of this script.
+DROP TRIGGER IF EXISTS auto_assign_tags_trigger ON documents;
+CREATE TRIGGER auto_assign_tags_trigger
+    AFTER UPDATE ON documents
+    FOR EACH ROW
+    EXECUTE FUNCTION trigger_auto_assign_tags();
+
+-- Final cleanup and optimization.
+-- NOTE(review): VACUUM cannot run inside a transaction block, so this
+-- statement fails if the script is executed in a single transaction (for
+-- example psql --single-transaction). Plain ANALYZE would refresh planner
+-- statistics without that restriction -- confirm how this script is invoked.
+VACUUM ANALYZE;
+
+-- Print completion message.
+-- SECURITY NOTE: the second notice writes the default admin password to the
+-- client output and the server log; remove it once the default credential
+-- has been changed.
+DO $$
+BEGIN
+    RAISE NOTICE 'Database initialization completed successfully!';
+    RAISE NOTICE 'Default admin user: admin@example.com / admin123';
+    RAISE NOTICE 'Default roles, compliance frameworks, and tags have been created.';
+END $$;
diff --git a/scripts/monitoring-setup.sh b/scripts/monitoring-setup.sh
new file mode 100644
index 0000000..ab16593
--- /dev/null
+++ b/scripts/monitoring-setup.sh
@@ -0,0 +1,793 @@
+#!/bin/bash
+
+# AI Document Agent Monitoring Setup Script
+# This script sets up comprehensive monitoring for the AI Document Agent
+
+set -e  # Exit on any error
+
+# Colors for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m' # No Color
+
+# Function to print colored output
+print_status() {
+    echo -e "${BLUE}[INFO]${NC} $1"
+}
+
+print_success() {
+    echo -e "${GREEN}[SUCCESS]${NC} $1"
+}
+
+print_warning() {
+    echo -e "${YELLOW}[WARNING]${NC} $1"
+}
+
+print_error() {
+    echo -e "${RED}[ERROR]${NC} $1"
+}
+
+# Configuration
+# Every setting can be overridden from the environment; the defaults are the
+# values that used to be hard-coded, so existing invocations are unaffected.
+#   MONITORING_DIR          root directory for the generated monitoring config
+#   GRAFANA_ADMIN_USER      Grafana administrator user name
+#   GRAFANA_ADMIN_PASSWORD  Grafana administrator password
+# SECURITY NOTE: "admin"/"admin" is a well-known default. Export
+# GRAFANA_ADMIN_PASSWORD before running this script anywhere other than local
+# development.
+MONITORING_DIR="${MONITORING_DIR:-./monitoring}"
+GRAFANA_ADMIN_USER="${GRAFANA_ADMIN_USER:-admin}"
+GRAFANA_ADMIN_PASSWORD="${GRAFANA_ADMIN_PASSWORD:-admin}"
+
+# Function to check if command exists
+command_exists() {
+    command -v "$1" >/dev/null 2>&1
+}
+
+# Function to check if Docker is running
+check_docker() {
+    if ! command_exists docker; then
+        print_error "Docker is not installed. Please install Docker first."
+        exit 1
+    fi
+    
+    if ! docker info >/dev/null 2>&1; then
+        print_error "Docker is not running. Please start Docker first."
+        exit 1
+    fi
+    
+    print_success "Docker is available and running"
+}
+
+# Function to create monitoring directories
+create_monitoring_directories() {
+    print_status "Creating monitoring directories..."
+    
+    mkdir -p "$MONITORING_DIR/grafana/dashboards"
+    mkdir -p "$MONITORING_DIR/grafana/datasources"
+    mkdir -p "$MONITORING_DIR/grafana/provisioning/dashboards"
+    mkdir -p "$MONITORING_DIR/grafana/provisioning/datasources"
+    mkdir -p "$MONITORING_DIR/prometheus/rules"
+    mkdir -p "$MONITORING_DIR/prometheus/alerts"
+    mkdir -p "$MONITORING_DIR/alertmanager"
+    mkdir -p "$MONITORING_DIR/jaeger"
+    mkdir -p "$MONITORING_DIR/elasticsearch"
+    mkdir -p "$MONITORING_DIR/kibana"
+    mkdir -p "$MONITORING_DIR/filebeat"
+    
+    print_success "Monitoring directories created"
+}
+
+# Function to setup Prometheus configuration
+setup_prometheus() {
+    print_status "Setting up Prometheus configuration..."
+    
+    # Create Prometheus configuration
+    cat > "$MONITORING_DIR/prometheus/prometheus.yml" << 'EOF'
+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+
+rule_files:
+  - "rules/*.yml"
+
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets:
+          - alertmanager:9093
+
+scrape_configs:
+  # Prometheus itself
+  - job_name: 'prometheus'
+    static_configs:
+      - targets: ['localhost:9090']
+
+  # AI Document Agent Backend
+  - job_name: 'ai-document-agent-backend'
+    static_configs:
+      - targets: ['backend:8000']
+    metrics_path: '/metrics'
+    scrape_interval: 10s
+
+  # AI Document Agent Frontend
+  - job_name: 'ai-document-agent-frontend'
+    static_configs:
+      - targets: ['frontend:3000']
+    metrics_path: '/metrics'
+    scrape_interval: 10s
+
+  # PostgreSQL
+  - job_name: 'postgresql'
+    static_configs:
+      - targets: ['postgres:5432']
+    scrape_interval: 30s
+
+  # Redis
+  - job_name: 'redis'
+    static_configs:
+      - targets: ['redis:6379']
+    scrape_interval: 30s
+
+  # ChromaDB
+  - job_name: 'chromadb'
+    static_configs:
+      - targets: ['chromadb:8000']
+    metrics_path: '/metrics'
+    scrape_interval: 30s
+
+  # Node Exporter (system metrics)
+  - job_name: 'node-exporter'
+    static_configs:
+      - targets: ['node-exporter:9100']
+    scrape_interval: 15s
+
+  # cAdvisor (container metrics)
+  - job_name: 'cadvisor'
+    static_configs:
+      - targets: ['cadvisor:8080']
+    scrape_interval: 15s
+
+  # Jaeger (tracing)
+  - job_name: 'jaeger'
+    static_configs:
+      - targets: ['jaeger:16686']
+    scrape_interval: 30s
+
+  # Elasticsearch
+  - job_name: 'elasticsearch'
+    static_configs:
+      - targets: ['elasticsearch:9200']
+    metrics_path: '/_prometheus/metrics'
+    scrape_interval: 30s
+EOF
+
+    # Create alert rules
+    cat > "$MONITORING_DIR/prometheus/rules/ai-document-agent.yml" << 'EOF'
+groups:
+  - name: ai-document-agent
+    rules:
+      # High CPU usage
+      - alert: HighCPUUsage
+        expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High CPU usage on {{ $labels.instance }}"
+          description: "CPU usage is above 80% for more than 5 minutes"
+
+      # High memory usage
+      - alert: HighMemoryUsage
+        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 85
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High memory usage on {{ $labels.instance }}"
+          description: "Memory usage is above 85% for more than 5 minutes"
+
+      # High disk usage
+      - alert: HighDiskUsage
+        expr: (node_filesystem_size_bytes - node_filesystem_free_bytes) / node_filesystem_size_bytes * 100 > 90
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High disk usage on {{ $labels.instance }}"
+          description: "Disk usage is above 90% for more than 5 minutes"
+
+      # Backend service down
+      - alert: BackendServiceDown
+        expr: up{job="ai-document-agent-backend"} == 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Backend service is down"
+          description: "The AI Document Agent backend service is not responding"
+
+      # Frontend service down
+      - alert: FrontendServiceDown
+        expr: up{job="ai-document-agent-frontend"} == 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Frontend service is down"
+          description: "The AI Document Agent frontend service is not responding"
+
+      # High error rate
+      - alert: HighErrorRate
+        expr: rate(http_requests_total{status_code=~"5.."}[5m]) / rate(http_requests_total[5m]) * 100 > 5
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High error rate detected"
+          description: "Error rate is above 5% for more than 5 minutes"
+
+      # Slow response time
+      - alert: SlowResponseTime
+        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Slow response time detected"
+          description: "95th percentile response time is above 2 seconds"
+
+      # Agent execution failures
+      - alert: AgentExecutionFailures
+        expr: rate(agent_execution_failures_total[5m]) > 0.1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "High agent execution failure rate"
+          description: "Agent execution failure rate is above 0.1 per second"
+
+      # Database connection issues
+      - alert: DatabaseConnectionIssues
+        expr: up{job="postgresql"} == 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Database connection issues"
+          description: "Cannot connect to PostgreSQL database"
+
+      # Redis connection issues
+      - alert: RedisConnectionIssues
+        expr: up{job="redis"} == 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Redis connection issues"
+          description: "Cannot connect to Redis cache"
+EOF
+
+    print_success "Prometheus configuration created"
+}
+
+# setup_alertmanager
+# Write $MONITORING_DIR/alertmanager/alertmanager.yml.
+#
+# Routing: alerts are grouped by alertname. Critical alerts go to the
+# 'slack.critical' receiver, warnings to 'slack.warning', and everything else
+# to the default 'web.hook' receiver. A critical alert inhibits a warning
+# that shares the same alertname, dev and instance labels.
+#
+# The Slack receivers use Alertmanager's built-in "slack.default.title" and
+# "slack.default.text" templates. The names used previously ("slack.title"
+# and "slack.text") are not defined by Alertmanager, and this configuration
+# loads no custom template file, so every Slack notification would have
+# failed to render.
+#
+# NOTE: slack_api_url is a placeholder and must be replaced with a real
+# webhook URL before Slack notifications can be delivered.
+setup_alertmanager() {
+    print_status "Setting up Alertmanager configuration..."
+
+    cat > "$MONITORING_DIR/alertmanager/alertmanager.yml" << 'EOF'
+global:
+  resolve_timeout: 5m
+  slack_api_url: 'https://hooks.slack.com/services/YOUR_SLACK_WEBHOOK'
+
+route:
+  group_by: ['alertname']
+  group_wait: 10s
+  group_interval: 10s
+  repeat_interval: 1h
+  receiver: 'web.hook'
+  routes:
+    - match:
+        severity: critical
+      receiver: 'slack.critical'
+      continue: true
+    - match:
+        severity: warning
+      receiver: 'slack.warning'
+
+receivers:
+  - name: 'web.hook'
+    webhook_configs:
+      - url: 'http://127.0.0.1:5001/'
+
+  - name: 'slack.critical'
+    slack_configs:
+      - channel: '#alerts-critical'
+        title: '{{ template "slack.default.title" . }}'
+        text: '{{ template "slack.default.text" . }}'
+        send_resolved: true
+
+  - name: 'slack.warning'
+    slack_configs:
+      - channel: '#alerts-warning'
+        title: '{{ template "slack.default.title" . }}'
+        text: '{{ template "slack.default.text" . }}'
+        send_resolved: true
+
+inhibit_rules:
+  - source_match:
+      severity: 'critical'
+    target_match:
+      severity: 'warning'
+    equal: ['alertname', 'dev', 'instance']
+EOF
+
+    print_success "Alertmanager configuration created"
+}
+
+# Function to setup Grafana datasources
+setup_grafana_datasources() {
+    print_status "Setting up Grafana datasources..."
+    
+    cat > "$MONITORING_DIR/grafana/provisioning/datasources/datasources.yml" << 'EOF'
+apiVersion: 1
+
+datasources:
+  - name: Prometheus
+    type: prometheus
+    access: proxy
+    url: http://prometheus:9090
+    isDefault: true
+    editable: true
+    jsonData:
+      timeInterval: "5s"
+      queryTimeout: "60s"
+      httpMethod: "POST"
+    secureJsonData: {}
+
+  - name: PostgreSQL
+    type: postgres
+    access: proxy
+    url: postgres:5432
+    database: ai_document_agent
+    user: postgres
+    secureJsonData:
+      password: "your_password_here"
+    jsonData:
+      sslmode: "disable"
+      maxOpenConns: 100
+      maxIdleConns: 100
+      connMaxLifetime: 14400
+
+  - name: Elasticsearch
+    type: elasticsearch
+    access: proxy
+    url: http://elasticsearch:9200
+    database: "ai-document-agent-logs"
+    jsonData:
+      timeField: "@timestamp"
+      esVersion: 7.0.0
+      maxConcurrentShardRequests: 5
+      logMessageField: message
+      logLevelField: level
+
+  - name: Jaeger
+    type: jaeger
+    access: proxy
+    url: http://jaeger:16686
+    jsonData:
+      nodeGraph:
+        enabled: true
+EOF
+
+    print_success "Grafana datasources configuration created"
+}
+
+# Function to setup Grafana dashboards provisioning
+setup_grafana_dashboards() {
+    print_status "Setting up Grafana dashboards provisioning..."
+    
+    cat > "$MONITORING_DIR/grafana/provisioning/dashboards/dashboards.yml" << 'EOF'
+apiVersion: 1
+
+providers:
+  - name: 'AI Document Agent'
+    orgId: 1
+    folder: ''
+    type: file
+    disableDeletion: false
+    updateIntervalSeconds: 10
+    allowUiUpdates: true
+    options:
+      path: /etc/grafana/provisioning/dashboards
+EOF
+
+    print_success "Grafana dashboards provisioning created"
+}
+
+# Generate $MONITORING_DIR/elasticsearch/elasticsearch.yml: a single-node
+# cluster listening on 0.0.0.0:9200 with X-Pack security disabled and
+# monitoring collection enabled.
+setup_elasticsearch() {
+    print_status "Setting up Elasticsearch configuration..."
+
+    local es_config="$MONITORING_DIR/elasticsearch/elasticsearch.yml"
+
+    # Quoted delimiter: body is written as-is.
+    cat << 'EOF' > "$es_config"
+cluster.name: ai-document-agent
+node.name: node-1
+network.host: 0.0.0.0
+http.port: 9200
+discovery.type: single-node
+xpack.security.enabled: false
+xpack.monitoring.enabled: true
+xpack.monitoring.collection.enabled: true
+
+# Memory settings
+bootstrap.memory_lock: true
+indices.memory.index_buffer_size: 30%
+
+# Logging
+logger.level: INFO
+EOF
+
+    print_success "Elasticsearch configuration created"
+}
+
+# Function to setup Filebeat configuration
+setup_filebeat() {
+    print_status "Setting up Filebeat configuration..."
+    
+    cat > "$MONITORING_DIR/filebeat/filebeat.yml" << 'EOF'
+filebeat.inputs:
+  - type: log
+    enabled: true
+    paths:
+      - /var/log/ai-document-agent/*.log
+    fields:
+      service: ai-document-agent
+    fields_under_root: true
+    multiline.pattern: '^\['
+    multiline.negate: true
+    multiline.match: after
+
+  - type: log
+    enabled: true
+    paths:
+      - /var/log/audit/*.log
+    fields:
+      service: audit
+    fields_under_root: true
+
+processors:
+  - add_host_metadata:
+      when.not.contains.tags: forwarded
+  - add_cloud_metadata: ~
+  - add_docker_metadata: ~
+  - add_kubernetes_metadata: ~
+
+output.elasticsearch:
+  hosts: ["elasticsearch:9200"]
+  indices:
+    - index: "filebeat-%{[agent.version]}-%{+yyyy.MM.dd}"
+
+setup.kibana:
+  host: "kibana:5601"
+
+setup.dashboards.enabled: true
+setup.template.enabled: true
+setup.template.name: "filebeat"
+setup.template.pattern: "filebeat-*"
+setup.template.overwrite: true
+EOF
+
+    print_success "Filebeat configuration created"
+}
+
+# Function to setup Jaeger configuration
+setup_jaeger() {
+    print_status "Setting up Jaeger configuration..."
+    
+    cat > "$MONITORING_DIR/jaeger/jaeger.yml" << 'EOF'
+sampling:
+  default_strategy:
+    type: probabilistic
+    param: 0.1
+
+storage:
+  type: elasticsearch
+  options:
+    es:
+      server_urls: http://elasticsearch:9200
+      index_prefix: jaeger
+      username: ""
+      password: ""
+
+ingester:
+  kafka:
+    consumer:
+      topic: jaeger-spans
+      brokers: kafka:9092
+
+agent:
+  http_server:
+    host_port: ":14268"
+EOF
+
+    print_success "Jaeger configuration created"
+}
+
+# Function to create monitoring Docker Compose
+create_monitoring_compose() {
+    print_status "Creating monitoring Docker Compose file..."
+    
+    cat > "$MONITORING_DIR/docker-compose.monitoring.yml" << 'EOF'
+version: '3.8'
+
+services:
+  # Prometheus
+  prometheus:
+    image: prom/prometheus:latest
+    container_name: ai-doc-bot-prometheus
+    ports:
+      - "9090:9090"
+    volumes:
+      - ./prometheus:/etc/prometheus
+      - prometheus_data:/prometheus
+    command:
+      - '--config.file=/etc/prometheus/prometheus.yml'
+      - '--storage.tsdb.path=/prometheus'
+      - '--web.console.libraries=/etc/prometheus/console_libraries'
+      - '--web.console.templates=/etc/prometheus/consoles'
+      - '--storage.tsdb.retention.time=200h'
+      - '--web.enable-lifecycle'
+    restart: unless-stopped
+    networks:
+      - monitoring
+
+  # Alertmanager
+  alertmanager:
+    image: prom/alertmanager:latest
+    container_name: ai-doc-bot-alertmanager
+    ports:
+      - "9093:9093"
+    volumes:
+      - ./alertmanager:/etc/alertmanager
+    command:
+      - '--config.file=/etc/alertmanager/alertmanager.yml'
+      - '--storage.path=/alertmanager'
+    restart: unless-stopped
+    networks:
+      - monitoring
+
+  # Grafana
+  grafana:
+    image: grafana/grafana:latest
+    container_name: ai-doc-bot-grafana
+    ports:
+      - "3001:3000"
+    environment:
+      - GF_SECURITY_ADMIN_USER=admin
+      - GF_SECURITY_ADMIN_PASSWORD=admin
+      - GF_USERS_ALLOW_SIGN_UP=false
+    volumes:
+      - ./grafana/provisioning:/etc/grafana/provisioning
+      - ./grafana/dashboards:/etc/grafana/provisioning/dashboards
+      - grafana_data:/var/lib/grafana
+    restart: unless-stopped
+    networks:
+      - monitoring
+
+  # Node Exporter
+  node-exporter:
+    image: prom/node-exporter:latest
+    container_name: ai-doc-bot-node-exporter
+    ports:
+      - "9100:9100"
+    volumes:
+      - /proc:/host/proc:ro
+      - /sys:/host/sys:ro
+      - /:/rootfs:ro
+    command:
+      - '--path.procfs=/host/proc'
+      - '--path.sysfs=/host/sys'
+      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
+    restart: unless-stopped
+    networks:
+      - monitoring
+
+  # cAdvisor
+  cadvisor:
+    image: gcr.io/cadvisor/cadvisor:latest
+    container_name: ai-doc-bot-cadvisor
+    ports:
+      - "8080:8080"
+    volumes:
+      - /:/rootfs:ro
+      - /var/run:/var/run:ro
+      - /sys:/sys:ro
+      - /var/lib/docker/:/var/lib/docker:ro
+      - /dev/disk/:/dev/disk:ro
+    restart: unless-stopped
+    networks:
+      - monitoring
+
+  # Elasticsearch
+  elasticsearch:
+    image: docker.elastic.co/elasticsearch/elasticsearch:7.17.0
+    container_name: ai-doc-bot-elasticsearch
+    environment:
+      - discovery.type=single-node
+      - "ES_JAVA_OPTS=-Xms512m -Xmx512m"
+    ports:
+      - "9200:9200"
+    volumes:
+      - ./elasticsearch/elasticsearch.yml:/usr/share/elasticsearch/config/elasticsearch.yml
+      - elasticsearch_data:/usr/share/elasticsearch/data
+    restart: unless-stopped
+    networks:
+      - monitoring
+
+  # Kibana
+  kibana:
+    image: docker.elastic.co/kibana/kibana:7.17.0
+    container_name: ai-doc-bot-kibana
+    environment:
+      - ELASTICSEARCH_HOSTS=http://elasticsearch:9200
+    ports:
+      - "5601:5601"
+    volumes:
+      - kibana_data:/usr/share/kibana/data
+    restart: unless-stopped
+    networks:
+      - monitoring
+    depends_on:
+      - elasticsearch
+
+  # Filebeat
+  filebeat:
+    image: docker.elastic.co/beats/filebeat:7.17.0
+    container_name: ai-doc-bot-filebeat
+    user: root
+    volumes:
+      - ./filebeat/filebeat.yml:/usr/share/filebeat/filebeat.yml:ro
+      - /var/lib/docker/containers:/var/lib/docker/containers:ro
+      - /var/log/ai-document-agent:/var/log/ai-document-agent:ro
+      - /var/log/audit:/var/log/audit:ro
+    restart: unless-stopped
+    networks:
+      - monitoring
+    depends_on:
+      - elasticsearch
+
+  # Jaeger
+  jaeger:
+    image: jaegertracing/all-in-one:latest
+    container_name: ai-doc-bot-jaeger
+    environment:
+      - COLLECTOR_OTLP_ENABLED=true
+    ports:
+      - "16686:16686"
+      - "14268:14268"
+      - "14250:14250"
+    restart: unless-stopped
+    networks:
+      - monitoring
+
+volumes:
+  prometheus_data:
+  grafana_data:
+  elasticsearch_data:
+  kibana_data:
+
+networks:
+  monitoring:
+    driver: bridge
+EOF
+
+    print_success "Monitoring Docker Compose file created"
+}
+
+# Start the monitoring stack in detached mode, then pause for a fixed grace
+# period before reporting it as ready.
+#
+# docker-compose runs inside a subshell so the `cd` does not change the
+# caller's working directory. Several commands in this script cd into
+# $MONITORING_DIR more than once per run (e.g. `restart`), which would break
+# on the second cd if $MONITORING_DIR were a relative path (its definition
+# is outside this section).
+#
+# Returns:
+#   0 once the stack has been started and the grace period has elapsed;
+#   1 (after logging an error) if the directory cannot be entered or
+#   docker-compose fails, instead of reporting success unconditionally.
+start_monitoring_services() {
+    print_status "Starting monitoring services..."
+
+    if ! (cd "$MONITORING_DIR" && docker-compose -f docker-compose.monitoring.yml up -d); then
+        print_error "Failed to start monitoring services"
+        return 1
+    fi
+
+    print_success "Monitoring services started"
+    print_status "Waiting for services to be ready..."
+    # Fixed delay only; container health is not actually probed.
+    sleep 30
+
+    print_success "Monitoring services are ready!"
+}
+
+# Function to show monitoring status
+show_monitoring_status() {
+    print_status "Monitoring Service Status:"
+    cd "$MONITORING_DIR"
+    docker-compose -f docker-compose.monitoring.yml ps
+    
+    echo ""
+    print_status "Monitoring URLs:"
+    echo "Grafana: http://localhost:3001 (admin/admin)"
+    echo "Prometheus: http://localhost:9090"
+    echo "Alertmanager: http://localhost:9093"
+    echo "Kibana: http://localhost:5601"
+    echo "Jaeger: http://localhost:16686"
+    echo "Elasticsearch: http://localhost:9200"
+    echo "cAdvisor: http://localhost:8080"
+    echo "Node Exporter: http://localhost:9100"
+}
+
+# Function to stop monitoring services
+stop_monitoring_services() {
+    print_status "Stopping monitoring services..."
+    
+    cd "$MONITORING_DIR"
+    docker-compose -f docker-compose.monitoring.yml down
+    
+    print_success "Monitoring services stopped"
+}
+
+# Print usage information: the available sub-commands and a few example
+# invocations, with $0 expanded to the name the script was invoked as.
+show_help() {
+    # Unquoted delimiter on purpose so that $0 is expanded in the text.
+    cat << EOF
+AI Document Agent Monitoring Setup Script
+
+Usage: $0 [OPTION]
+
+Options:
+  setup     - Complete monitoring setup (default)
+  start     - Start monitoring services only
+  stop      - Stop monitoring services
+  restart   - Restart monitoring services
+  status    - Show monitoring service status
+  help      - Show this help message
+
+Examples:
+  $0 setup    # Complete setup
+  $0 start    # Start services
+  $0 status   # Show status
+EOF
+}
+
+# Command dispatcher. Selects the action from the first argument, falling
+# back to "setup" when none is given; unknown actions print the help text
+# and exit with status 1.
+main() {
+    local action="${1:-setup}"
+    local step
+
+    case "$action" in
+        setup)
+            print_status "Starting AI Document Agent monitoring setup..."
+
+            # Full provisioning pipeline, executed strictly in this order.
+            for step in \
+                check_docker \
+                create_monitoring_directories \
+                setup_prometheus \
+                setup_alertmanager \
+                setup_grafana_datasources \
+                setup_grafana_dashboards \
+                setup_elasticsearch \
+                setup_filebeat \
+                setup_jaeger \
+                create_monitoring_compose \
+                start_monitoring_services \
+                show_monitoring_status
+            do
+                "$step"
+            done
+
+            print_success "Monitoring setup completed successfully!"
+            print_status "You can now access Grafana at http://localhost:3001"
+            ;;
+        start)
+            start_monitoring_services
+            show_monitoring_status
+            ;;
+        stop)
+            stop_monitoring_services
+            ;;
+        restart)
+            stop_monitoring_services
+            start_monitoring_services
+            show_monitoring_status
+            ;;
+        status)
+            show_monitoring_status
+            ;;
+        help|-h|--help)
+            show_help
+            ;;
+        *)
+            print_error "Unknown option: $1"
+            show_help
+            exit 1
+            ;;
+    esac
+}
+
+# Script entry point: forward every command-line argument, unmodified, to main.
+main "$@"