diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..8e10e22 --- /dev/null +++ b/.env.example @@ -0,0 +1,23 @@ +# Environment Configuration for Vector Code Retrieval System + +# Ollama Configuration +OLLAMA_HOST=http://localhost:11434 +OLLAMA_MODEL=qwen3:8b + +# Embedding Configuration +EMBEDDING_SERVER=http://localhost:5000 +EMBEDDING_MODEL=nomic-ai/nomic-embed-text-v1.5 + +# Storage Configuration +CHROMA_PATH=./chroma_code + +# Service Selection +USE_LOCAL_EMBEDDINGS=true +USE_LOCAL_OLLAMA=true + +# Trust Remote Code Settings (automatically managed by trust_manager.py) +# Format: TRUST_REMOTE_CODE_=true|false +# These are automatically added when you approve/deny models +# Example: +# # TRUST_REMOTE_CODE_A1B2C3D4_MODEL=nomic-ai/nomic-embed-text-v1.5 +# TRUST_REMOTE_CODE_A1B2C3D4=true \ No newline at end of file diff --git a/.env_example b/.env_example deleted file mode 100644 index 27d3506..0000000 --- a/.env_example +++ /dev/null @@ -1,14 +0,0 @@ -# Ollama Configuration -OLLAMA_HOST=http://localhost:11434 -OLLAMA_MODEL=dolphincoder:15b - -# Embedding Configuration -EMBEDDING_SERVER=http://localhost:5000 -EMBEDDING_MODEL=nomic-ai/nomic-embed-text-v1.5 - -# ChromaDB Configuration -CHROMA_PATH=./chroma_code - -# Default Settings -USE_LOCAL_EMBEDDINGS=true -USE_LOCAL_OLLAMA=true diff --git a/.gitignore_example b/.gitignore_example new file mode 100644 index 0000000..13c92ed --- /dev/null +++ b/.gitignore_example @@ -0,0 +1,7 @@ +.gitignore +venv/ +.vscode/ +__pycache__/ +chroma_code/ +chroma_db/ +*-queries.md \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 797068f..0000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "terminal.integrated.allowMnemonics": false, - "terminal.integrated.automationProfile.linux": null -} \ No newline at end of file diff --git a/README.md b/README.md index 43b7aa5..71745f3 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,8 @@ 
A powerful semantic search system for log files that enables natural language qu - **Local LLM Integration**: Generates AI responses using Ollama with customizable models - **Interactive Query Interface**: Rich terminal interface with markdown rendering - **GPU Acceleration**: Optional GPU support for faster embedding generation -- **Comprehensive File Support**: Indexes `.py`, `.log`, `.js`, `.ts`, `.md`, `.sql`, `.html`, `.csv` files +- **Automatic File Detection**: Intelligently detects and indexes all text-based files by content analysis +- **Security-First Design**: Client-side trust_remote_code management with consent prompts and persistent tracking - **Environment Configuration**: Fully configurable via `.env` files ## Quick Start @@ -63,25 +64,30 @@ USE_LOCAL_EMBEDDINGS=true USE_LOCAL_OLLAMA=true ``` -### 3. Index Your Log Files +### 3. Index Your Files -Index a directory using local embeddings (default): +Index a directory with automatic file detection: ```bash -python index.py /path/to/your/logs +python index.py /path/to/your/files ``` +The system will: +- Automatically detect all text-based files by content analysis +- Skip binary files and common build/cache directories +- Prompt for trust_remote_code consent if needed for the embedding model + Or specify embedding type: ```bash -# Use local SentenceTransformer embeddings -python index.py /path/to/logs --local-embeddings +# Use local SentenceTransformer embeddings (default) +python index.py /path/to/files --local-embeddings # Use Ollama embeddings -python index.py /path/to/logs --ollama-embeddings +python index.py /path/to/files --ollama-embeddings # Use remote embedding server -python index.py /path/to/logs --remote-embeddings +python index.py /path/to/files --remote-embeddings ``` Additional options: @@ -94,7 +100,7 @@ python index.py /path/to/logs --model custom-model --chunk-size 1500 python index.py /path/to/logs --chroma-path ./my_custom_db ``` -### 4. Query Your Logs +### 4. 
Query Your Indexed Content Start the interactive query interface: @@ -102,6 +108,11 @@ Start the interactive query interface: python ask.py ``` +The system will: +- Auto-detect the embedding type used during indexing +- Apply same trust_remote_code settings for consistency +- Generate responses using Ollama's local LLM + Or specify a custom output file: ```bash @@ -113,22 +124,32 @@ python ask.py my_queries.md ### Core Components 1. **Unified Indexer (`index.py`)** - - Processes repositories and creates vector embeddings + - Processes repositories with automatic file detection - Supports multiple embedding strategies via handler classes - - Chunks code into configurable segments (default: 2000 characters) + - Chunks content into configurable segments (default: 2000 characters) + - Client-side trust_remote_code management - Stores embeddings in ChromaDB with metadata tracking 2. **Query Interface (`ask.py`)** - - Interactive CLI for natural language log queries - - Auto-detects embedding type from metadata + - Interactive CLI for natural language queries + - Auto-detects embedding type and trust settings from metadata - Generates responses using Ollama's local LLM + - Consistent security model with indexing phase - Saves all Q&A pairs with timestamps 3. **Embedding Server (`embedding_server.py`)** - Optional remote embedding service with GPU support + - Respects client-side trust_remote_code decisions - RESTful API with health checks and server info - - Configurable via command-line arguments - - Supports batch processing and model caching + - Dynamic model loading with trust setting caching + - Supports batch processing and multiple model variants + +4. 
**Trust Manager (`trust_manager.py`)** + - Centralized security management for trust_remote_code + - Auto-detection of models requiring remote code execution + - Interactive consent prompts with risk/benefit explanations + - Persistent approval tracking in .env files + - CLI tools for managing trust settings ### Embedding Handlers @@ -149,6 +170,7 @@ python ask.py my_queries.md | `CHROMA_PATH` | ChromaDB storage path | `./chroma_code` | | `USE_LOCAL_EMBEDDINGS` | Default embedding strategy | `true` | | `USE_LOCAL_OLLAMA` | Use local Ollama instance | `true` | +| `TRUST_REMOTE_CODE_*` | Model-specific trust settings | Auto-managed | ### Command Line Options @@ -180,6 +202,73 @@ Options: --debug Enable debug mode ``` +## Security: Trust Remote Code Management + +The system includes a comprehensive security framework for models that require `trust_remote_code=True`. This client-side security system: + +- **Auto-detects** which models likely need remote code execution based on known patterns +- **Prompts for informed consent** with detailed security warnings +- **Persists decisions** in `.env` with model-specific hash tracking +- **Client-side control** - trust decisions made locally, not on remote servers +- **Cross-component consistency** - same security model for indexing, querying, and serving + +### How It Works + +1. **Detection**: System analyzes model names against known patterns +2. **User Consent**: Interactive prompts with clear risk/benefit explanations +3. **Persistence**: Decisions saved locally with model identification hashes +4. 
**Communication**: Client sends trust settings to remote embedding servers + +### Managing Trust Settings + +```bash +# List all approved/denied models +python trust_manager.py --list + +# Check if a specific model needs trust_remote_code +python trust_manager.py --check "nomic-ai/nomic-embed-text-v1.5" +``` + +### Security Flow + +When you first use a model requiring remote code execution: + +``` +============================================================== +SECURITY WARNING: Remote Code Execution +============================================================== +Model: nomic-ai/nomic-embed-text-v1.5 + +This model may require 'trust_remote_code=True' which allows +the model to execute arbitrary code during loading. + +RISKS: +- The model could execute malicious code +- Your system could be compromised +- Data could be stolen or corrupted + +BENEFITS: +- Access to newer/specialized models +- Better embedding quality for some models + +Your choice will be saved for this model. +============================================================== +Allow remote code execution for this model? 
[y/N]: +``` + +### Trust Settings Storage + +Approval decisions are stored in your `.env` file: + +```bash +# Example entries (automatically managed) +# TRUST_REMOTE_CODE_A1B2C3D4_MODEL=nomic-ai/nomic-embed-text-v1.5 +TRUST_REMOTE_CODE_A1B2C3D4=true + +# TRUST_REMOTE_CODE_E5F6G7H8_MODEL=sentence-transformers/all-MiniLM-L6-v2 +TRUST_REMOTE_CODE_E5F6G7H8=false +``` + ## Advanced Usage ### Remote Embedding Server @@ -257,11 +346,14 @@ The system automatically detects and works with databases created by older versi ## Dependencies - **chromadb**: Vector database for embeddings -- **sentence-transformers**: Local embedding generation +- **sentence-transformers**: Local embedding generation (optional, only needed for local embeddings) - **ollama**: LLM client for local inference - **rich**: Enhanced terminal output and markdown rendering - **flask**: Web server for embedding API - **python-dotenv**: Environment configuration management +- **tiktoken**: Token counting utilities +- **einops**: Tensor operations for advanced models +- **requests**: HTTP client for remote services ## File Structure @@ -269,6 +361,7 @@ The system automatically detects and works with databases created by older versi ├── index.py # Unified indexing script ├── ask.py # Interactive query interface ├── embedding_server.py # Remote embedding server +├── trust_manager.py # Security: trust_remote_code management ├── requirements.txt # Python dependencies ├── .env_example # Environment configuration template └── chroma_code/ # Default ChromaDB storage (created after indexing) @@ -276,4 +369,10 @@ The system automatically detects and works with databases created by older versi ## License -This project is designed for local development and research use. Please ensure compliance with the terms of service for any external models or APIs used. \ No newline at end of file +This project is designed for local development and research use. 
Please ensure compliance with the terms of service for any external models or APIs used. + +## Contributions + +I welcome any assistance on this project, especially around trying new models for better performance and testing against more logs than I have at my disposal! + +Please just fork off of dev and then submit a PR. \ No newline at end of file diff --git a/__pycache__/ask.cpython-311.pyc b/__pycache__/ask.cpython-311.pyc deleted file mode 100644 index 4c754a8..0000000 Binary files a/__pycache__/ask.cpython-311.pyc and /dev/null differ diff --git a/__pycache__/index_remote.cpython-311.pyc b/__pycache__/index_remote.cpython-311.pyc deleted file mode 100644 index 857e736..0000000 Binary files a/__pycache__/index_remote.cpython-311.pyc and /dev/null differ diff --git a/ask.py b/ask.py index 40b9039..ac3f475 100644 --- a/ask.py +++ b/ask.py @@ -1,15 +1,15 @@ #!/usr/bin/env python3 """ -Interactive Code Query Tool +Interactive Log File Query Tool -Query your indexed codebase using natural language and get AI-generated answers. +Query your indexed log files using natural language and get AI-generated answers. Uses the embeddings created by index.py and generates responses using Ollama. Usage: python ask.py [output_file.md] Arguments: - output_file.md Optional. Markdown file to save Q&A pairs (default: codebase_queries.md) + output_file.md Optional. 
Markdown file to save Q&A pairs (default: logfile_queries.md) """ import os @@ -31,7 +31,7 @@ # Configuration from environment OLLAMA_HOST = os.getenv('OLLAMA_HOST', 'http://localhost:11434') -OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'dolphincoder:15b') +OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'qwen3:8b') EMBEDDING_SERVER = os.getenv('EMBEDDING_SERVER', 'http://localhost:5000') EMBEDDING_MODEL = os.getenv('EMBEDDING_MODEL', 'nomic-ai/nomic-embed-text-v1.5') CHROMA_PATH = os.getenv('CHROMA_PATH', './chroma_code') @@ -39,7 +39,7 @@ USE_LOCAL_OLLAMA = os.getenv('USE_LOCAL_OLLAMA', 'true').lower() == 'true' # Constants -DEFAULT_OUTPUT_FILE = "codebase_queries.md" +DEFAULT_OUTPUT_FILE = "logfile_queries.md" DEFAULT_TOP_K = 5 OLLAMA_EMBEDDING_MODEL = "nomic-embed-text" @@ -164,14 +164,13 @@ def _get_embedding_ollama(self, text: str) -> List[float]: def _get_embedding_local(self, text: str) -> List[float]: """Get embedding using local model""" try: - from sentence_transformers import SentenceTransformer - if not self._local_model: console.print("[yellow]Loading local embedding model...[/yellow]") - self._local_model = SentenceTransformer(self.embedding_model) - self._local_model.max_seq_length = 512 + from trust_manager import safe_sentence_transformer_load # type: ignore[import] + self._local_model = safe_sentence_transformer_load(self.embedding_model) + self._local_model.max_seq_length = 512 # type: ignore[attr-defined] - embedding = self._local_model.encode([text], show_progress_bar=False) + embedding = self._local_model.encode([text], show_progress_bar=False) # type: ignore[attr-defined] # Handle different return types from encode if hasattr(embedding, 'tolist'): # If it's a numpy array @@ -198,9 +197,18 @@ def _get_embedding_local(self, text: str) -> List[float]: def _get_embedding_remote(self, text: str) -> List[float]: """Get embedding from remote server""" try: + # Get trust setting for this model + from trust_manager import TrustManager + trust_manager = 
TrustManager() + trust_remote_code = trust_manager.get_trust_setting(self.embedding_model, interactive=True) + response = requests.post( f"{EMBEDDING_SERVER}/embed", - json={"texts": [text], "model": self.embedding_model}, + json={ + "texts": [text], + "model": self.embedding_model, + "trust_remote_code": trust_remote_code + }, timeout=60 ) response.raise_for_status() @@ -210,7 +218,7 @@ def _get_embedding_remote(self, text: str) -> List[float]: raise def query_codebase(self, question: str, top_k: int = DEFAULT_TOP_K) -> str: - """Query the codebase and generate a response""" + """Query the logs and generate a response""" # Get embedding for the question try: q_embed = self.get_embedding(question) @@ -285,8 +293,8 @@ def write_to_markdown(question: str, answer: str, filename: str) -> None: # Create the file if it doesn't exist if not os.path.exists(filename): with open(filename, "w", encoding="utf-8") as f: - f.write("# Codebase Query Log\n\n") - f.write("This file contains questions and answers about the codebase.\n\n") + f.write("# Log File Query Log\n\n") + f.write("This file contains questions and answers about the Log Files.\n\n") # Append the Q&A pair with open(filename, "a", encoding="utf-8") as f: @@ -305,7 +313,7 @@ def main() -> None: output_file = sys.argv[1] if len(sys.argv) == 2 else DEFAULT_OUTPUT_FILE - console.print(f"\n[bold cyan]Code Query Tool[/bold cyan]") + console.print(f"\n[bold cyan]Log Query Tool[/bold cyan]") console.print(f"Output file: [cyan]{output_file}[/cyan]") console.print(f"Ollama model: [cyan]{OLLAMA_MODEL}[/cyan]") console.print("\nType 'exit' or 'quit' to stop.\n") @@ -320,7 +328,7 @@ def main() -> None: # Interactive loop while True: try: - question = input("\n[?] Ask a question about the codebase: ") + question = input("\n[?] 
Ask a question about the log files: ") if question.lower() in ['exit', 'quit', 'q']: console.print(f"\n[green]✓ All responses saved to {output_file}[/green]") @@ -330,7 +338,7 @@ def main() -> None: continue # Generate answer - console.print("\n[yellow]Searching codebase and generating response...[/yellow]") + console.print("\n[yellow]Searching log files and generating response...[/yellow]") answer = handler.query_codebase(question) # Write to file diff --git a/embedding_server.py b/embedding_server.py index 0d63363..8555d78 100644 --- a/embedding_server.py +++ b/embedding_server.py @@ -33,6 +33,7 @@ model: Optional[SentenceTransformer] = None device: Optional[str] = None args: Optional[argparse.Namespace] = None +model_cache: Dict[str, SentenceTransformer] = {} # Cache models with different trust settings def initialize_model() -> None: @@ -44,45 +45,83 @@ def initialize_model() -> None: print(f"\nLoading SentenceTransformer model: {args.model}") - # CUDA Diagnostics - print("\nCUDA Diagnostics:") + # GPU Acceleration Diagnostics + print("\nGPU Acceleration Diagnostics:") print(f"PyTorch version: {torch.__version__}") print(f"CUDA available: {torch.cuda.is_available()}") + # Check for MPS (Apple Silicon GPU) + mps_available = hasattr(torch.backends, 'mps') and torch.backends.mps.is_available() + print(f"MPS (Apple Silicon) available: {mps_available}") + if torch.cuda.is_available(): if hasattr(torch, 'version') and hasattr(torch.version, 'cuda') and torch.version.cuda: # type: ignore[attr-defined] print(f"PyTorch CUDA version: {torch.version.cuda}") # type: ignore[attr-defined] - print(f"Number of GPUs: {torch.cuda.device_count()}") + print(f"Number of CUDA GPUs: {torch.cuda.device_count()}") for i in range(torch.cuda.device_count()): - print(f"GPU {i}: {torch.cuda.get_device_name(i)}") + print(f"CUDA GPU {i}: {torch.cuda.get_device_name(i)}") props = cast(Any, torch.cuda.get_device_properties(i)) # type: ignore[misc] - print(f"GPU {i} Memory: {props.total_memory 
/ 1024**3:.1f} GB") + print(f"CUDA GPU {i} Memory: {props.total_memory / 1024**3:.1f} GB") device = 'cuda' + elif mps_available: + print("Using Apple Silicon GPU (MPS) for acceleration") + device = 'mps' else: - print("CUDA not available. Reasons could be:") - print("1. NVIDIA GPU not present") - print("2. CUDA drivers not installed") - print("3. PyTorch not compiled with CUDA support") - print("4. Environment variables not set correctly") + print("No GPU acceleration available. Using CPU.") + print("Possible reasons:") + print("- No NVIDIA GPU (for CUDA)") + print("- No Apple Silicon chip (for MPS)") + print("- PyTorch not compiled with GPU support") + print("- Missing drivers or environment setup") device = 'cpu' print(f"\nUsing device: {device}") - # Load model - model = SentenceTransformer(args.model, device=device, trust_remote_code=True) - model.max_seq_length = args.max_length + # Determine trust setting for default model + if args.trust_remote_code: + default_trust = True + print(f"Using --trust-remote-code flag: trust_remote_code=True for {args.model}") + else: + from trust_manager import TrustManager + trust_manager = TrustManager() + default_trust = trust_manager.check_model_needs_trust(args.model) + print(f"Auto-detected trust_remote_code={default_trust} for {args.model}") + + # Load default model with appropriate trust setting + model = get_or_load_model(args.model, default_trust) - print(f"Model loaded successfully!") + print(f"Default model loaded: {args.model}") print(f"Max sequence length: {args.max_length}") print(f"Batch size: {args.batch_size}") + print(f"Note: Client requests can override model and trust settings") + + +def get_or_load_model(model_name: str, trust_remote_code: bool) -> SentenceTransformer: + """Get or load a model with specific trust_remote_code setting""" + global model_cache, device, args + from sentence_transformers import SentenceTransformer + + if args is None: + raise RuntimeError("Server not initialized") + + # Create 
cache key + cache_key = f"{model_name}:trust={trust_remote_code}" + + if cache_key not in model_cache: + print(f"Loading model {model_name} with trust_remote_code={trust_remote_code}") + loaded_model = SentenceTransformer(model_name, device=device, trust_remote_code=trust_remote_code) + loaded_model.max_seq_length = args.max_length + model_cache[cache_key] = loaded_model + + return model_cache[cache_key] @app.route('/embed', methods=['POST']) def embed() -> Any: """Generate embeddings for provided texts""" try: - if model is None or args is None: - return jsonify({'error': 'Model not initialized'}), 500 + if args is None: + return jsonify({'error': 'Server not initialized'}), 500 data = request.json if data is None: @@ -90,21 +129,17 @@ def embed() -> Any: data = cast(Dict[str, Any], data) texts = data.get('texts', []) - - # Allow model override per request - request_model = data.get('model') - if request_model and request_model != args.model: - # For now, we don't support dynamic model switching - # This could be implemented with a model cache - return jsonify({ - 'error': f'Model switching not supported. 
Server is using: {args.model}' - }), 400 + request_model = data.get('model', args.model) + trust_remote_code = data.get('trust_remote_code', False) if not texts: return jsonify({'error': 'No texts provided'}), 400 + # Get the appropriate model + model_to_use = get_or_load_model(request_model, trust_remote_code) + # Generate embeddings on GPU/CPU - embeddings_result = cast(Any, model.encode( # type: ignore[misc] + embeddings_result = cast(Any, model_to_use.encode( # type: ignore[misc] texts, batch_size=args.batch_size, show_progress_bar=False, @@ -114,7 +149,8 @@ def embed() -> Any: return jsonify({ 'embeddings': embeddings, - 'model': args.model, + 'model': request_model, + 'trust_remote_code': trust_remote_code, 'count': len(embeddings) }) @@ -209,6 +245,11 @@ def main() -> None: action='store_true', help='Run in debug mode' ) + parser.add_argument( + '--trust-remote-code', + action='store_true', + help='Force trust_remote_code=True for default model (auto-detected if not specified)' + ) args = parser.parse_args() diff --git a/index.py b/index.py index 412f7a1..2aff469 100644 --- a/index.py +++ b/index.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 """ -Unified Code Indexer with Multiple Embedding Options +Unified Log File Indexer with Multiple Embedding Options -This script indexes a codebase for semantic search using various embedding strategies: +This script indexes a path of log files for semantic search using various embedding strategies: - Local embeddings using SentenceTransformer - Ollama embeddings using Ollama's API - Remote embeddings using a dedicated embedding server @@ -15,7 +15,7 @@ --ollama-embeddings Use Ollama's embedding API --remote-embeddings Use remote embedding server --model MODEL Specify embedding model (overrides .env) - --chunk-size SIZE Size of code chunks (default: 2000) + --chunk-size SIZE Size of log chunks (default: 2000) """ import os @@ -39,7 +39,7 @@ # Configuration from environment OLLAMA_HOST = os.getenv('OLLAMA_HOST', 
'http://localhost:11434') -OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'dolphincoder:15b') +OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'qwen3:8b') EMBEDDING_SERVER = os.getenv('EMBEDDING_SERVER', 'http://localhost:5000') EMBEDDING_MODEL = os.getenv('EMBEDDING_MODEL', 'nomic-ai/nomic-embed-text-v1.5') CHROMA_PATH = os.getenv('CHROMA_PATH', './chroma_code') @@ -47,7 +47,6 @@ USE_LOCAL_OLLAMA = os.getenv('USE_LOCAL_OLLAMA', 'true').lower() == 'true' # Constants -SUPPORTED_EXTENSIONS = ['.py', '.log', '.js', '.ts', '.md', '.sql', '.html', '.csv'] DEFAULT_CHUNK_SIZE = 2000 OLLAMA_EMBEDDING_MODEL = "nomic-embed-text" @@ -60,7 +59,7 @@ class EmbeddingHandler: def __init__(self, model: Optional[str] = None): self.model: str = model or EMBEDDING_MODEL - def embed(self, texts: List[str]) -> List[List[float]]: + def embed(self, _texts: List[str]) -> List[List[float]]: """Generate embeddings for a list of texts""" raise NotImplementedError @@ -77,21 +76,26 @@ def __init__(self, model: Optional[str] = None): self.transformer: Optional[Any] = None self.device: str = 'cpu' try: - from sentence_transformers import SentenceTransformer import torch - # Check for CUDA availability + # Check for GPU acceleration if torch.cuda.is_available(): console.print(f"[green]✓ CUDA available: {torch.cuda.get_device_name(0)}[/green]") self.device = 'cuda' + elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available(): + console.print("[green]✓ Apple Silicon GPU (MPS) available[/green]") + self.device = 'mps' else: - console.print("[yellow]! CUDA not available, using CPU[/yellow]") + console.print("[yellow]! 
No GPU acceleration available, using CPU[/yellow]") self.device = 'cpu' - self.transformer = SentenceTransformer(self.model, device=self.device) - self.transformer.max_seq_length = 512 + from trust_manager import safe_sentence_transformer_load # type: ignore[import] + self.transformer = safe_sentence_transformer_load(self.model, device=self.device) + self.transformer.max_seq_length = 512 # type: ignore[attr-defined] except ImportError: raise ImportError("sentence-transformers not installed. Run: pip install sentence-transformers") + except Exception as e: + raise RuntimeError(f"Failed to load embedding model: {e}") def embed(self, texts: List[str]) -> List[List[float]]: """Generate embeddings using local model""" @@ -164,6 +168,13 @@ def __init__(self, model: Optional[str] = None): self.base_url: str = EMBEDDING_SERVER self.max_retries: int = 3 self.retry_delay: int = 1 + self.trust_remote_code: bool = self._get_trust_setting() + + def _get_trust_setting(self) -> bool: + """Get trust_remote_code setting for this model""" + from trust_manager import TrustManager + trust_manager = TrustManager() + return trust_manager.get_trust_setting(self.model, interactive=True) def embed(self, texts: List[str]) -> List[List[float]]: """Generate embeddings using remote server with retry logic""" @@ -171,7 +182,11 @@ def embed(self, texts: List[str]) -> List[List[float]]: try: response = requests.post( f"{self.base_url}/embed", - json={"texts": texts, "model": self.model}, + json={ + "texts": texts, + "model": self.model, + "trust_remote_code": self.trust_remote_code + }, timeout=60 ) response.raise_for_status() @@ -199,22 +214,61 @@ def check_availability(self) -> bool: return False +def is_indexable_file(file_path: Path) -> bool: + """Determine if a file can be indexed by examining its content""" + try: + # Skip if file is too large (> 100MB) + if file_path.stat().st_size > 100 * 1024 * 1024: + return False + + with open(file_path, 'rb') as f: + # Read first 8KB to check content 
+ chunk = f.read(8192) + if not chunk: + return False # Empty file + + # Check for null bytes (indicates binary content) + if b'\x00' in chunk: + return False + + # Try to decode as text using common encodings + for encoding in ['utf-8', 'latin1', 'cp1252', 'iso-8859-1']: + try: + chunk.decode(encoding) + return True + except UnicodeDecodeError: + continue + + return False + + except (IOError, OSError, PermissionError): + return False + + def collect_files(repo_path: Path) -> List[Path]: - """Collect all supported files from the repository""" + """Collect all indexable files from the repository by scanning content""" files: List[Path] = [] - for ext in SUPPORTED_EXTENSIONS: - files.extend(list(repo_path.rglob(f"*{ext}"))) - # Filter out common directories to ignore - ignore_dirs = {'.git', '__pycache__', 'node_modules', '.env', 'venv', 'env', '.venv'} - files = [f for f in files if not any(ignored in f.parts for ignored in ignore_dirs)] + ignore_dirs = {'.git', '__pycache__', 'node_modules', '.env', 'venv', 'env', '.venv', + 'target', 'build', 'dist', '.svn', '.hg', '.idea', '.vscode'} + + # Recursively scan all files + for file_path in repo_path.rglob('*'): + if file_path.is_file(): + # Skip files in ignored directories + if any(ignored in file_path.parts for ignored in ignore_dirs): + continue + + # Check if file is indexable by content + if is_indexable_file(file_path): + files.append(file_path) return sorted(files) def chunk_code(content: str, chunk_size: int = DEFAULT_CHUNK_SIZE) -> List[str]: - """Split code into chunks""" + """Split logs into chunks""" chunks: List[str] = [] lines = content.split('\n') current_chunk: List[str] = [] @@ -265,7 +319,18 @@ def process_repository( for file_path in files: try: - content = file_path.read_text(encoding='utf-8', errors='ignore') + # Try different encodings to read the file + content = None + for encoding in ['utf-8', 'latin1', 'cp1252', 'iso-8859-1']: + try: + content = file_path.read_text(encoding=encoding) + break 
+ except UnicodeDecodeError: + continue + + if content is None: + console.print(f"[yellow]Could not decode {file_path}, skipping[/yellow]") + continue chunks = chunk_code(content, chunk_size) for i, chunk in enumerate(chunks): @@ -380,7 +445,7 @@ def save_metadata(repo_path: Path, embedding_type: str, model: str, chunk_size: def main() -> None: parser = argparse.ArgumentParser( - description="Index a codebase for semantic search", + description="Index log files for semantic search", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=__doc__ ) @@ -416,7 +481,7 @@ def main() -> None: '--chunk-size', type=int, default=DEFAULT_CHUNK_SIZE, - help=f'Size of code chunks (default: {DEFAULT_CHUNK_SIZE})' + help=f'Size of log file chunks (default: {DEFAULT_CHUNK_SIZE})' ) parser.add_argument( '--chroma-path', @@ -436,7 +501,7 @@ def main() -> None: console.print(f"[red]Error: Repository path does not exist: {repo_path}[/red]") sys.exit(1) - console.print(f"\n[bold cyan]Code Indexer[/bold cyan]") + console.print(f"\n[bold cyan]Log Indexer[/bold cyan]") console.print(f"Repository: {repo_path}") # Determine embedding type diff --git a/trust_manager.py b/trust_manager.py new file mode 100644 index 0000000..405167f --- /dev/null +++ b/trust_manager.py @@ -0,0 +1,284 @@ +#!/usr/bin/env python3 +""" +Trust Remote Code Manager + +Handles detection of models requiring trust_remote_code and manages user consent. +Saves approval state in .env with model tracking for security. 
+""" + +import os +from typing import Optional, TYPE_CHECKING +from dotenv import load_dotenv + +if TYPE_CHECKING: + from sentence_transformers import SentenceTransformer + +# Load environment variables +load_dotenv() + + +class TrustManager: + """Manages trust_remote_code consent and detection""" + + def __init__(self, env_file: str = '.env'): + self.env_file = env_file + self._ensure_env_file() + + def _ensure_env_file(self) -> None: + """Ensure .env file exists""" + if not os.path.exists(self.env_file): + with open(self.env_file, 'w') as f: + f.write("# Trust Remote Code Settings\n") + f.write("# Format: TRUST_REMOTE_CODE_=true|false\n\n") + + def _model_to_key(self, model_name: str) -> str: + """Convert model name to environment variable key""" + # Create a hash of the model name for consistent key generation + import hashlib + model_hash = hashlib.md5(model_name.encode()).hexdigest()[:8] + return f"TRUST_REMOTE_CODE_{model_hash.upper()}" + + def _get_model_approval(self, model_name: str) -> Optional[bool]: + """Check if model has been approved for trust_remote_code""" + key = self._model_to_key(model_name) + value = os.getenv(key) + if value is None: + return None + return value.lower() == 'true' + + def set_model_approval(self, model_name: str, approved: bool) -> None: + """Save model approval to .env file""" + key = self._model_to_key(model_name) + value = 'true' if approved else 'false' + + # Add comment with model name for clarity + comment_key = f"# {key}_MODEL" + + # Read existing content + env_content: list[str] = [] + if os.path.exists(self.env_file): + with open(self.env_file, 'r') as f: + env_content = f.readlines() + + # Remove existing entries for this model + filtered_content: list[str] = [] + skip_next = False + for line in env_content: + if skip_next: + skip_next = False + continue + if line.strip() == f"{comment_key}={model_name}": + skip_next = True # Skip the next line (the actual setting) + continue + if not line.startswith(key + '='): + 
filtered_content.append(line) + + # Add new entries + filtered_content.append(f"{comment_key}={model_name}\n") + filtered_content.append(f"{key}={value}\n") + + # Write back to file + with open(self.env_file, 'w') as f: + f.writelines(filtered_content) + + # Reload environment + load_dotenv(override=True) + + def check_model_needs_trust(self, model_name: str) -> bool: + """ + Auto-detect if a model requires trust_remote_code. + This is a heuristic based on known model patterns. + """ + # Known models that require trust_remote_code + trust_required_patterns = [ + 'nomic-ai/', + 'microsoft/codebert', + 'sentence-transformers/all-mpnet-base-v2', + # Add more patterns as needed + ] + + # Known models that don't require trust_remote_code + safe_patterns = [ + 'sentence-transformers/all-MiniLM', + 'sentence-transformers/paraphrase', + 'sentence-transformers/multi-qa', + ] + + model_lower = model_name.lower() + + # Check safe patterns first + for pattern in safe_patterns: + if pattern.lower() in model_lower: + return False + + # Check trust required patterns + for pattern in trust_required_patterns: + if pattern.lower() in model_lower: + return True + + # For unknown models, assume trust might be needed + return True + + def prompt_user_consent(self, model_name: str) -> bool: + """Prompt user for consent to use trust_remote_code""" + print(f"\n{'='*60}") + print("SECURITY WARNING: Remote Code Execution") + print(f"{'='*60}") + print(f"Model: {model_name}") + print() + print("This model may require 'trust_remote_code=True' which allows") + print("the model to execute arbitrary code during loading.") + print() + print("RISKS:") + print("- The model could execute malicious code") + print("- Your system could be compromised") + print("- Data could be stolen or corrupted") + print() + print("BENEFITS:") + print("- Access to newer/specialized models") + print("- Better embedding quality for some models") + print() + print("Your choice will be saved for this model.") + 
print(f"{'='*60}") + + while True: + try: + response = input("Allow remote code execution for this model? [y/N]: ").strip().lower() + if response in ['y', 'yes']: + return True + elif response in ['n', 'no', '']: + return False + else: + print("Please enter 'y' for yes or 'n' for no.") + except KeyboardInterrupt: + print("\nOperation cancelled.") + return False + + def get_trust_setting(self, model_name: str, interactive: bool = True) -> bool: + """ + Get trust_remote_code setting for a model. + + Args: + model_name: Name of the model + interactive: Whether to prompt user if not already approved + + Returns: + True if trust_remote_code should be used, False otherwise + """ + # Check if we have a saved decision + approval = self._get_model_approval(model_name) + if approval is not None: + return approval + + # Check if model needs trust_remote_code + needs_trust = self.check_model_needs_trust(model_name) + if not needs_trust: + # Model is known safe, save this decision + self.set_model_approval(model_name, False) + return False + + # Model might need trust, prompt user if interactive + if interactive: + user_approved = self.prompt_user_consent(model_name) + self.set_model_approval(model_name, user_approved) + return user_approved + else: + # Non-interactive mode, default to False for security + print(f"Warning: Model {model_name} may require trust_remote_code but running in non-interactive mode.") + print("Defaulting to False for security. 
Use interactive mode to approve.") + return False + + def list_approved_models(self) -> dict[str, bool]: + """List all models and their approval status""" + approved_models: dict[str, bool] = {} + + # Read .env file and find trust settings + if os.path.exists(self.env_file): + with open(self.env_file, 'r') as f: + lines = f.readlines() + + i = 0 + while i < len(lines): + line = lines[i].strip() + if line.startswith('# TRUST_REMOTE_CODE_') and line.endswith('_MODEL'): + # This is a model comment line + model_name = line.split('=', 1)[1] if '=' in line else 'Unknown' + # Next line should be the setting + if i + 1 < len(lines): + setting_line = lines[i + 1].strip() + if '=' in setting_line: + approved = setting_line.split('=', 1)[1].lower() == 'true' + approved_models[model_name] = approved + i += 2 + else: + i += 1 + + return approved_models + + +def safe_sentence_transformer_load(model_name: str, interactive: bool = True, **kwargs: object) -> 'SentenceTransformer': + """ + Safely load a SentenceTransformer with trust_remote_code consent management. 
+ + Args: + model_name: Name of the model to load + interactive: Whether to prompt user for consent + **kwargs: Additional arguments to pass to SentenceTransformer + + Returns: + Loaded SentenceTransformer model + """ + from sentence_transformers import SentenceTransformer + + trust_manager = TrustManager() + trust_setting = trust_manager.get_trust_setting(model_name, interactive) + + # Remove any existing trust_remote_code setting from kwargs + kwargs_dict = dict(kwargs) + kwargs_dict.pop('trust_remote_code', None) + + if trust_setting: + print(f"Loading {model_name} with trust_remote_code=True (user approved)") + return SentenceTransformer(model_name, trust_remote_code=True, **kwargs_dict) # type: ignore[misc] + else: + print(f"Loading {model_name} with trust_remote_code=False") + try: + return SentenceTransformer(model_name, trust_remote_code=False, **kwargs_dict) # type: ignore[misc] + except Exception as e: + print(f"Error loading model with trust_remote_code=False: {e}") + if interactive: + print("Model may require trust_remote_code=True. 
Would you like to try again with remote code enabled?") + if trust_manager.prompt_user_consent(model_name): + trust_manager.set_model_approval(model_name, True) + return SentenceTransformer(model_name, trust_remote_code=True, **kwargs_dict) # type: ignore[misc] + raise + + +if __name__ == "__main__": + # CLI for managing trust settings + import argparse + + parser = argparse.ArgumentParser(description="Manage trust_remote_code settings") + parser.add_argument('--list', action='store_true', help='List approved models') + parser.add_argument('--check', type=str, help='Check if model needs trust_remote_code') + + args = parser.parse_args() + + trust_manager = TrustManager() + + if args.list: + models = trust_manager.list_approved_models() + if models: + print("Approved models:") + for model, approved in models.items(): + status = "✓ Approved" if approved else "✗ Denied" + print(f" {model}: {status}") + else: + print("No models have been evaluated yet.") + + elif args.check: + needs_trust = trust_manager.check_model_needs_trust(args.check) + print(f"Model {args.check} {'likely needs' if needs_trust else 'probably does not need'} trust_remote_code") + + else: + parser.print_help() \ No newline at end of file