diff --git a/README.md b/README.md index dad8a30..4512d41 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ VectorPrime is built for developers and researchers who run inference locally an | Ollama export | Generates a `Modelfile` with tuned `num_thread` and `num_gpu` values, ready for `ollama create` | Stable | | Format conversion | Bidirectional GGUF-to-ONNX and ONNX-to-GGUF conversion with full metadata round-trip | Stable | | Python API | PyO3 native extension — import and call from any Python script or notebook | Stable | -| CLI interface | `profile`, `optimize`, `convert-to-onnx`, and `convert-to-gguf` subcommands | Stable | +| CLI interface | `profile` (with `--json`, `--verbose`, `--save`), `optimize`, `convert-to-onnx`, `convert-to-gguf`, and `doctor` subcommands | Stable | --- @@ -118,16 +118,204 @@ VectorPrime detects which runtimes are available at startup and silently skips a vectorprime profile ``` -Prints a JSON hardware profile to stdout: +Prints a human-readable hardware summary to stdout: + +``` +VectorPrime Hardware Profile +──────────────────────────────────────── + +CPU + Model: Intel Core i9-14900HX + Cores: 32 threads + SIMD Support: AVX2 + +GPU + Model: NVIDIA GeForce RTX 4090 Laptop GPU + Vendor: NVIDIA + VRAM: 16.0 GB + Compute Capability: 8.9 + Tensor Cores: Yes + +Memory + Total RAM: 31.9 GB + Available RAM: 29.9 GB + +Acceleration Support + ✓ GPU inference available + ✓ FP16 supported + ✓ INT8 supported + ✓ TensorRT compatible + +Recommended Inference Setup + Runtime: TensorRT / llama.cpp + Precision: FP16 or INT8 + Estimated Model Capacity: + • ~70B quantized + • ~7B full GPU + +Tip: run `vectorprime profile --verbose` for full hardware diagnostics. +``` + +**Options:** + +``` +vectorprime profile [OPTIONS] + +Options: + --json Output the full hardware profile as structured JSON to stdout. 
+ --verbose Show a detailed hardware diagnostic report including clock speed, + SIMD features, CUDA/driver versions, runtime compatibility, and + optimization hints. + --save PATH Save the JSON profile to a file. Can be combined with --json to + print JSON to stdout and save to a file simultaneously. +``` + +#### JSON output (`--json`) + +```bash +vectorprime profile --json +``` ```json { - "cpu": { "core_count": 16, "simd_level": "AVX2" }, - "gpu": { "name": "NVIDIA GeForce RTX 4090", "vram_mb": 24564, "compute_capability": [8, 9] }, - "ram": { "total_mb": 65536, "available_mb": 48000 } + "cpu": { + "brand": "Intel(R) Core(TM) i9-14900HX", + "core_count": 32, + "simd_level": "AVX2" + }, + "gpu": { + "name": "NVIDIA GeForce RTX 4090 Laptop GPU", + "vendor": "Nvidia", + "vram_mb": 16376, + "compute_capability": [8, 9], + "tensor_cores": true + }, + "ram": { + "total_mb": 31941, + "available_mb": 29935 + }, + "capabilities": { + "gpu_inference": true, + "fp16": true, + "int8": true, + "tensorrt_supported": true, + "tensor_cores": true + }, + "recommendation": { + "preferred_runtime": ["TensorRT", "llama.cpp"], + "preferred_precision": ["FP16", "INT8"] + } } ``` +#### Save to file (`--save`) + +```bash +vectorprime profile --save hw.json +# Hardware profile saved to: hw.json + +vectorprime profile --json --save hw.json +# Prints JSON to stdout AND saves to hw.json +``` + +#### Verbose diagnostics (`--verbose`) + +```bash +vectorprime profile --verbose +``` + +``` +VectorPrime Hardware Diagnostic Report +═══════════════════════════════════════ + +CPU + Model: Intel Core i9-14900HX + Architecture: x86_64 + Physical Cores: 24 + Logical Threads: 32 + Base Clock: 2.2 GHz + SIMD Features: SSE4, AVX, AVX2 + L3 Cache: 36 MB + +GPU + Model: NVIDIA GeForce RTX 4090 Laptop GPU + Vendor: NVIDIA + Compute Capability: 8.9 + VRAM: 16.0 GB + Tensor Cores: Yes + CUDA Version: 12.4 + Driver Version: 550.xx + Memory Bandwidth: ~576 GB/s + +System Memory + Total RAM: 31.9 GB + Available 
RAM: 29.9 GB + Swap: 8.0 GB + +Acceleration Support + CUDA: Available + TensorRT: Compatible + FP16 Inference: Supported + INT8 Inference: Supported + +Runtime Compatibility + llama.cpp: Supported (CPU + GPU offload) + ONNX Runtime: Supported + TensorRT: Supported + vLLM: Supported + +VectorPrime Optimization Hints + Recommended Runtime: TensorRT / llama.cpp + Recommended Precision: FP16 / INT8 + Suggested Threads: 16–32 + GPU Offload Capacity: High + +System Readiness + ✓ CUDA driver detected + ✓ GPU compute capability supported + ✓ Sufficient VRAM for large LLMs + +System ready for optimized LLM inference. +``` + +### Check System Readiness (`doctor`) + +```bash +vectorprime doctor +``` + +Probes for required inference components and reports which are available: + +``` +VectorPrime System Check +──────────────────────── + +✓ CUDA installed +✓ GPU driver detected +✓ TensorRT available +✓ llama.cpp GPU support + +System ready for optimized inference. +``` + +If a component is missing, its line shows `✗` and the summary changes to: + +``` +✗ TensorRT available +✗ llama.cpp GPU support + +Some components missing — see above. +``` + +The doctor command checks: + +| Component | Detection method | +|---|---| +| CUDA installed | `nvidia-smi` on PATH | +| GPU driver detected | `nvidia-smi` on PATH | +| TensorRT available | `trtexec` on PATH | +| llama.cpp GPU support | `llama-cli` or `llama-server` on PATH | + ### Optimize a Model ```bash diff --git a/python/vectorprime/cli.py b/python/vectorprime/cli.py index 724f479..1472ae3 100644 --- a/python/vectorprime/cli.py +++ b/python/vectorprime/cli.py @@ -1,8 +1,20 @@ -"""VectorPrime command-line interface.""" +"""VectorPrime command-line interface. + +Location: python/vectorprime/cli.py + +Summary: Entry point for the `vectorprime` CLI. Parses arguments and dispatches +to command handlers (profile, optimize, convert-to-onnx, convert-to-gguf, doctor). 
+ +Used by: pyproject.toml console_scripts entry point; also importable as a module. +""" import argparse -import sys +import json import os +import platform +import shutil +import subprocess +import sys def _print_logo() -> None: @@ -126,14 +138,588 @@ def _print_model_summary(info: dict) -> None: print(div) -def cmd_profile(_args: argparse.Namespace) -> None: +def _mb_to_gb_str(mb: float) -> str: + """Convert megabytes to a human-readable gigabyte string, e.g. '31.9 GB'.""" + return f"{mb / 1024:.1f} GB" + + +def _compute_cap_str(cap: list) -> str: + """Format a compute capability list like [8, 9] as the string '8.9'.""" + if cap and len(cap) >= 2: + return f"{cap[0]}.{cap[1]}" + return "N/A" + + +def _round_to_model_size(param_billions: int) -> int: + """Round a raw parameter count in billions to the nearest common model size.""" + common_sizes = [7, 13, 30, 70, 130] + if param_billions <= 0: + return param_billions + closest = min(common_sizes, key=lambda s: abs(s - param_billions)) + return closest + + +def _derive_capabilities(hw: dict) -> dict: + """Derive capability flags from a raw hardware profile dict. + + Args: + hw: Parsed hardware profile containing 'cpu', 'gpu', and 'ram' keys. + + Returns: + A dict with boolean capability flags. + """ + gpu = hw.get("gpu") + vendor = (gpu.get("vendor", "") if gpu else "").lower() + cap = gpu.get("compute_capability", [0, 0]) if gpu else [0, 0] + cap_tuple = tuple(cap[:2]) if len(cap) >= 2 else (0, 0) + is_nvidia = vendor == "nvidia" + tensorrt_ok = is_nvidia and cap_tuple >= (7, 0) + tensor_cores = is_nvidia and cap_tuple >= (7, 0) + return { + "gpu_inference": gpu is not None, + "fp16": gpu is not None, + "int8": gpu is not None, + "tensorrt_supported": tensorrt_ok, + "tensor_cores": tensor_cores, + } + + +def _derive_recommendation(hw: dict, caps: dict, installed_runtimes: dict) -> dict: + """Derive runtime and precision recommendations from hardware and capabilities. 
+ + Runtime recommendations are ordered by suitability for the detected hardware + class and filtered to only runtimes confirmed as installed. If nothing in + the priority list is installed the full priority list is returned as + aspirational recommendations (so the caller still gets a useful answer). + + Precision recommendations are based solely on hardware capabilities and do + not depend on installed software. + + Args: + hw: Parsed hardware profile dict. + caps: Capability flags from _derive_capabilities(). + installed_runtimes: Boolean dict from _check_runtime_support(). + + Returns: + A dict with 'preferred_runtime' and 'preferred_precision' lists. + Values are human-readable display strings (e.g. "llama.cpp", "FP16"). + """ + gpu = hw.get("gpu") + vendor = (gpu.get("vendor", "") if gpu else "").lower() + cpu = hw.get("cpu", {}) + simd_level = cpu.get("simd_level", "") + + cap = (gpu.get("compute_capability", [0, 0]) if gpu else [0, 0]) + cap_tuple = tuple(cap[:2]) if len(cap) >= 2 else (0, 0) + + is_nvidia = vendor == "nvidia" + is_amd = vendor == "amd" + is_apple = vendor in ("apple", "metal") + + # --- Runtime priority list (keys from _check_runtime_support) --- + if is_nvidia and cap_tuple >= (7, 0): + # Volta and above: TensorRT is the optimal choice. + priority = ["tensorrt", "vllm", "onnx_runtime", "llama_cpp", "ollama"] + elif is_nvidia: + # Pascal and older NVIDIA: no TensorRT Tensor Core benefit. + priority = ["vllm", "onnx_runtime", "llama_cpp", "ollama"] + elif is_amd: + priority = ["llama_cpp", "onnx_runtime", "ollama"] + elif is_apple: + priority = ["llama_cpp", "onnx_runtime", "ollama"] + elif gpu: + # Unknown GPU vendor — safe defaults. + priority = ["llama_cpp", "onnx_runtime", "ollama"] + else: + # CPU-only. + priority = ["llama_cpp", "ollama", "onnx_runtime"] + + # Filter priority list to installed runtimes; fall back to full list if + # nothing is installed (aspirational recommendations). 
+ installed_priority = [k for k in priority if installed_runtimes.get(k)] + effective_priority = installed_priority if installed_priority else priority + preferred_runtime = [_RUNTIME_LABELS[k] for k in effective_priority] + + # --- Precision based on hardware capabilities only --- + if is_nvidia and cap_tuple >= (8, 0): + # Ampere / Ada / Hopper: full Tensor Core support including INT4. + preferred_precision = ["FP16", "INT8", "INT4"] + elif is_nvidia and cap_tuple >= (7, 0): + # Volta / Turing: Tensor Cores for FP16 and INT8. + preferred_precision = ["FP16", "INT8"] + elif is_nvidia: + # Pascal and older: no Tensor Cores. + preferred_precision = ["FP32", "FP16"] + elif is_amd or is_apple: + # ROCm and Metal MPS both support FP16 and INT8. + preferred_precision = ["FP16", "INT8"] + elif gpu: + # Unknown GPU. + preferred_precision = ["FP16", "INT8"] + else: + # CPU-only: choose quantized formats based on available SIMD. + if simd_level == "AVX512": + preferred_precision = ["INT8", "Q4_K_M", "Q8_0"] + elif simd_level == "AVX2": + preferred_precision = ["Q4_K_M", "Q8_0", "INT8"] + else: + preferred_precision = ["Q4_K_M", "Q8_0"] + + return { + "preferred_runtime": preferred_runtime, + "preferred_precision": preferred_precision, + } + + +def _model_capacity_estimate(vram_mb: float) -> tuple: + """Estimate model capacity from VRAM. + + Args: + vram_mb: Available VRAM in megabytes. + + Returns: + Tuple of (quantized_B, full_gpu_B) as rounded parameter counts in billions. + """ + vram_gb = vram_mb / 1024 + quantized_raw = int(vram_gb * 4) + full_gpu_raw = int(vram_gb / 2) + quantized_b = _round_to_model_size(quantized_raw) if quantized_raw > 0 else quantized_raw + full_gpu_b = _round_to_model_size(full_gpu_raw) if full_gpu_raw > 0 else full_gpu_raw + return quantized_b, full_gpu_b + + +def _get_nvidia_smi_info() -> dict: + """Run nvidia-smi and parse CUDA and driver version from its output. 
+ + Returns: + Dict with 'cuda_version' and 'driver_version' strings, or 'N/A' on failure. + """ + result = {"cuda_version": "N/A", "driver_version": "N/A"} + if not shutil.which("nvidia-smi"): + return result + try: + proc = subprocess.run( + ["nvidia-smi"], + capture_output=True, + text=True, + timeout=5, + ) + for line in proc.stdout.splitlines(): + if "CUDA Version:" in line: + parts = line.split("CUDA Version:") + if len(parts) > 1: + result["cuda_version"] = parts[1].strip().rstrip("|").strip() + if "Driver Version:" in line: + parts = line.split("Driver Version:") + if len(parts) > 1: + # Driver Version: 550.xx CUDA Version: 12.4 + token = parts[1].strip().split()[0] + result["driver_version"] = token + except Exception: + pass + return result + + +def _get_verbose_cpu_info(hw: dict) -> dict: + """Gather verbose CPU information from /proc/cpuinfo and the hardware profile. + + Args: + hw: Parsed hardware profile dict. + + Returns: + Dict with cpu diagnostic fields; missing values fall back to 'N/A'. + """ + cpu = hw.get("cpu", {}) + logical_threads = cpu.get("core_count", "N/A") + arch = platform.machine() + + # Physical cores: try psutil, otherwise estimate as logical // 2. + try: + import psutil # type: ignore[import] + physical_cores = psutil.cpu_count(logical=False) or "N/A" + except ImportError: + physical_cores = (logical_threads // 2) if isinstance(logical_threads, int) else "N/A" + + # Base clock from /proc/cpuinfo. + base_clock = "N/A" + try: + with open("/proc/cpuinfo", "r", encoding="utf-8") as f: + for line in f: + if line.lower().startswith("cpu mhz"): + mhz_str = line.split(":", 1)[1].strip() + mhz = float(mhz_str) + base_clock = f"{mhz / 1000:.1f} GHz" + break + except Exception: + pass + + # Expand SIMD level into a full feature list. 
+ simd_map = { + "AVX512": "SSE4, AVX, AVX2, AVX512", + "AVX2": "SSE4, AVX, AVX2", + "AVX": "SSE4, AVX", + } + simd_raw = cpu.get("simd_level", "") + simd_features = simd_map.get(simd_raw, "SSE2") + + # L3 cache from /proc/cpuinfo. + l3_cache = "N/A" + try: + with open("/proc/cpuinfo", "r", encoding="utf-8") as f: + for line in f: + if "cache size" in line.lower(): + l3_cache = line.split(":", 1)[1].strip() + break + except Exception: + pass + + return { + "arch": arch, + "physical_cores": physical_cores, + "logical_threads": logical_threads, + "base_clock": base_clock, + "simd_features": simd_features, + "l3_cache": l3_cache, + } + + +def _get_verbose_mem_info() -> dict: + """Parse /proc/meminfo for swap total. + + Returns: + Dict with 'swap' as a GB string, or 'N/A'. + """ + swap = "N/A" + try: + with open("/proc/meminfo", "r", encoding="utf-8") as f: + for line in f: + if line.startswith("SwapTotal:"): + kb = int(line.split()[1]) + swap = f"{kb / 1024 / 1024:.1f} GB" + break + except Exception: + pass + return {"swap": swap} + + +def _get_gpu_bandwidth(gpu_name: str) -> str: + """Look up approximate memory bandwidth for a known GPU model. + + Args: + gpu_name: GPU model name string from the hardware profile. + + Returns: + Memory bandwidth string, e.g. '~1008 GB/s', or 'N/A'. + """ + name_lower = gpu_name.lower() + bandwidth_table = { + "rtx 4090 laptop": "~576 GB/s", + "rtx 4090": "~1008 GB/s", + "rtx 3090": "~936 GB/s", + "rtx 3080": "~760 GB/s", + "rtx 3070": "~448 GB/s", + "a100": "~2000 GB/s", + "h100": "~3350 GB/s", + } + for key, val in bandwidth_table.items(): + if key in name_lower: + return val + return "N/A" + + +def _check_runtime_support() -> dict: + """Check which inference runtimes are available on this system. + + Uses PATH probing for binary runtimes and importlib.util.find_spec for + Python-package-based runtimes (onnxruntime, vllm) to avoid importing them. + + Returns: + Dict mapping internal runtime keys to booleans. 
+ Keys: "llama_cpp", "ollama", "tensorrt", "onnx_runtime", "vllm". + """ + import importlib.util + + return { + "llama_cpp": bool(shutil.which("llama-cli") or shutil.which("llama-server")), + "ollama": shutil.which("ollama") is not None, + "tensorrt": shutil.which("trtexec") is not None, + "onnx_runtime": importlib.util.find_spec("onnxruntime") is not None, + "vllm": importlib.util.find_spec("vllm") is not None, + } + + +# Human-readable display labels for each internal runtime key. +_RUNTIME_LABELS: dict = { + "tensorrt": "TensorRT", + "vllm": "vLLM", + "onnx_runtime": "ONNX Runtime", + "llama_cpp": "llama.cpp", + "ollama": "Ollama", +} + + +def _print_pretty_profile(hw: dict, caps: dict, rec: dict) -> None: + """Print the default human-readable hardware profile summary. + + Args: + hw: Parsed hardware profile dict. + caps: Capability flags from _derive_capabilities(). + rec: Recommendations from _derive_recommendation(). + """ + div = "─" * 40 + cpu = hw.get("cpu", {}) + gpu = hw.get("gpu") + ram = hw.get("ram", {}) + + print("VectorPrime Hardware Profile") + print(div) + print() + + # CPU section. + print("CPU") + brand = cpu.get("brand", "N/A") + # Strip "(R)", "(TM)" for a cleaner display name. + display_brand = brand.replace("(R)", "").replace("(TM)", "").strip() + print(f" {'Model:':<22} {display_brand}") + print(f" {'Cores:':<22} {cpu.get('core_count', 'N/A')} threads") + print(f" {'SIMD Support:':<22} {cpu.get('simd_level', 'N/A')}") + print() + + # GPU section. 
+ print("GPU") + if gpu: + gpu_name = gpu.get("name", "N/A") + vendor = gpu.get("vendor", "N/A") + vram_mb = gpu.get("vram_mb", 0) + cap = gpu.get("compute_capability", []) + print(f" {'Model:':<22} {gpu_name}") + print(f" {'Vendor:':<22} {vendor.upper() if vendor != 'N/A' else vendor}") + print(f" {'VRAM:':<22} {_mb_to_gb_str(vram_mb)}") + print(f" {'Compute Capability:':<22} {_compute_cap_str(cap)}") + tc_str = "Yes" if caps.get("tensor_cores") else "No" + print(f" {'Tensor Cores:':<22} {tc_str}") + else: + print(" No GPU detected.") + print() + + # Memory section. + print("Memory") + print(f" {'Total RAM:':<22} {_mb_to_gb_str(ram.get('total_mb', 0))}") + print(f" {'Available RAM:':<22} {_mb_to_gb_str(ram.get('available_mb', 0))}") + print() + + # Acceleration support. + print("Acceleration Support") + _checkmark = lambda flag: "✓" if flag else "✗" + print(f" {_checkmark(caps['gpu_inference'])} GPU inference available") + print(f" {_checkmark(caps['fp16'])} FP16 supported") + print(f" {_checkmark(caps['int8'])} INT8 supported") + print(f" {_checkmark(caps['tensorrt_supported'])} TensorRT compatible") + print() + + # Recommended inference setup. + print("Recommended Inference Setup") + runtime_str = " / ".join(rec.get("preferred_runtime", [])) + precision_str = " or ".join(rec.get("preferred_precision", [])) + print(f" {'Runtime:':<22} {runtime_str}") + print(f" {'Precision:':<22} {precision_str}") + + if gpu: + vram_mb = gpu.get("vram_mb", 0) + q_b, full_b = _model_capacity_estimate(vram_mb) + print(f" Estimated Model Capacity:") + print(f" • ~{q_b}B quantized") + print(f" • ~{full_b}B full GPU") + + print() + print("Tip: run `vectorprime profile --verbose` for full hardware diagnostics.") + + +def _print_verbose_profile(hw: dict, caps: dict, rec: dict, installed_runtimes: dict) -> None: + """Print the full verbose hardware diagnostic report. + + Args: + hw: Parsed hardware profile dict. + caps: Capability flags from _derive_capabilities(). 
+ rec: Recommendations from _derive_recommendation(). + installed_runtimes: Boolean dict from _check_runtime_support(). + """ + heavy_div = "═" * 39 + div = "─" * 39 + cpu = hw.get("cpu", {}) + gpu = hw.get("gpu") + ram = hw.get("ram", {}) + + cpu_verbose = _get_verbose_cpu_info(hw) + mem_verbose = _get_verbose_mem_info() + nvidia_info = _get_nvidia_smi_info() if gpu else {"cuda_version": "N/A", "driver_version": "N/A"} + + print("VectorPrime Hardware Diagnostic Report") + print(heavy_div) + print() + + # CPU. + print("CPU") + brand = cpu.get("brand", "N/A") + display_brand = brand.replace("(R)", "").replace("(TM)", "").strip() + print(f" {'Model:':<22} {display_brand}") + print(f" {'Architecture:':<22} {cpu_verbose['arch']}") + print(f" {'Physical Cores:':<22} {cpu_verbose['physical_cores']}") + print(f" {'Logical Threads:':<22} {cpu_verbose['logical_threads']}") + print(f" {'Base Clock:':<22} {cpu_verbose['base_clock']}") + print(f" {'SIMD Features:':<22} {cpu_verbose['simd_features']}") + print(f" {'L3 Cache:':<22} {cpu_verbose['l3_cache']}") + print() + + # GPU. + print("GPU") + if gpu: + gpu_name = gpu.get("name", "N/A") + vendor = gpu.get("vendor", "N/A") + vram_mb = gpu.get("vram_mb", 0) + cap = gpu.get("compute_capability", []) + bw = _get_gpu_bandwidth(gpu_name) + tc_str = "Yes" if caps.get("tensor_cores") else "No" + print(f" {'Model:':<22} {gpu_name}") + print(f" {'Vendor:':<22} {vendor.upper() if vendor != 'N/A' else vendor}") + print(f" {'Compute Capability:':<22} {_compute_cap_str(cap)}") + print(f" {'VRAM:':<22} {_mb_to_gb_str(vram_mb)}") + print(f" {'Tensor Cores:':<22} {tc_str}") + print(f" {'CUDA Version:':<22} {nvidia_info['cuda_version']}") + print(f" {'Driver Version:':<22} {nvidia_info['driver_version']}") + print(f" {'Memory Bandwidth:':<22} {bw}") + else: + print(" No GPU detected.") + print() + + # System Memory. 
+ print("System Memory") + print(f" {'Total RAM:':<22} {_mb_to_gb_str(ram.get('total_mb', 0))}") + print(f" {'Available RAM:':<22} {_mb_to_gb_str(ram.get('available_mb', 0))}") + print(f" {'Swap:':<22} {mem_verbose['swap']}") + print() + + # Acceleration support. + print("Acceleration Support") + cuda_avail = "Available" if shutil.which("nvidia-smi") else "Not found" + trt_avail = "Compatible" if caps.get("tensorrt_supported") else "Not available" + fp16_avail = "Supported" if caps.get("fp16") else "Not supported" + int8_avail = "Supported" if caps.get("int8") else "Not supported" + print(f" {'CUDA:':<22} {cuda_avail}") + print(f" {'TensorRT:':<22} {trt_avail}") + print(f" {'FP16 Inference:':<22} {fp16_avail}") + print(f" {'INT8 Inference:':<22} {int8_avail}") + print() + + # Runtime compatibility — display clean labels with Supported / Not found status. + print("Runtime Compatibility") + for key, label in _RUNTIME_LABELS.items(): + status = "Supported" if installed_runtimes.get(key) else "Not found" + print(f" {(label + ':'):<22} {status}") + print() + + # VectorPrime optimization hints. + print("VectorPrime Optimization Hints") + runtime_str = " / ".join(rec.get("preferred_runtime", [])) + precision_str = " / ".join(rec.get("preferred_precision", [])) + logical = cpu.get("core_count", 0) + thread_lo = max(1, logical // 2) if isinstance(logical, int) else "N/A" + thread_hi = logical if isinstance(logical, int) else "N/A" + thread_range = f"{thread_lo}–{thread_hi}" if isinstance(thread_lo, int) else "N/A" + gpu_offload = "High" if (gpu and gpu.get("vram_mb", 0) >= 8192) else ("Medium" if gpu else "None") + print(f" {'Recommended Runtime:':<26} {runtime_str}") + print(f" {'Recommended Precision:':<26} {precision_str}") + print(f" {'Suggested Threads:':<26} {thread_range}") + print(f" {'GPU Offload Capacity:':<26} {gpu_offload}") + print() + + # System readiness. 
+ print("System Readiness") + cuda_ok = shutil.which("nvidia-smi") is not None + gpu_cap_ok = gpu is not None + vram_ok = gpu is not None and gpu.get("vram_mb", 0) >= 4096 + _c = lambda flag: "✓" if flag else "✗" + print(f" {_c(cuda_ok)} CUDA driver detected") + print(f" {_c(gpu_cap_ok)} GPU compute capability supported") + print(f" {_c(vram_ok)} Sufficient VRAM for large LLMs") + print() + if cuda_ok and gpu_cap_ok and vram_ok: + print("System ready for optimized LLM inference.") + else: + print("Some system components may limit inference performance.") + + +def cmd_profile(args: argparse.Namespace) -> None: + """Handle the `vectorprime profile` command. + + Supports four output modes controlled by flags: + - default: Pretty human-readable summary. + - --verbose: Full hardware diagnostic report. + - --json: Structured JSON output to stdout. + - --save: Save JSON to a file (combinable with --json). + """ try: import vectorprime._vectorprime as _vectorprime # type: ignore[import] - hw = _vectorprime.profile_hardware() - print(hw.to_json()) + hw_obj = _vectorprime.profile_hardware() + hw: dict = json.loads(hw_obj.to_json()) except RuntimeError as e: print(f"ERROR: {e}", file=sys.stderr) sys.exit(1) + except Exception as e: + print(f"ERROR: Failed to parse hardware profile: {e}", file=sys.stderr) + sys.exit(1) + + caps = _derive_capabilities(hw) + installed_runtimes = _check_runtime_support() + rec = _derive_recommendation(hw, caps, installed_runtimes) + + # Determine output mode. --verbose takes precedence over --json. + verbose: bool = getattr(args, "verbose", False) + as_json: bool = getattr(args, "json", False) + save_path: str | None = getattr(args, "save", None) + + if verbose: + _print_verbose_profile(hw, caps, rec, installed_runtimes) + return + + if as_json or save_path: + # Build enriched JSON payload. 
+ gpu = hw.get("gpu") + cap = (gpu.get("compute_capability", []) if gpu else []) + enriched_gpu = None + if gpu: + enriched_gpu = { + "name": gpu.get("name"), + "vendor": gpu.get("vendor"), + "vram_mb": gpu.get("vram_mb"), + "compute_capability": cap, + "tensor_cores": caps.get("tensor_cores", False), + } + payload = { + "cpu": hw.get("cpu"), + "gpu": enriched_gpu, + "ram": hw.get("ram"), + "capabilities": caps, + "recommendation": rec, + } + json_str = json.dumps(payload, indent=2) + + if as_json: + print(json_str) + + if save_path: + try: + with open(save_path, "w", encoding="utf-8") as f: + f.write(json_str) + f.write("\n") + print(f"Hardware profile saved to: {save_path}") + except OSError as e: + print(f"ERROR: Could not write to {save_path}: {e}", file=sys.stderr) + sys.exit(1) + return + + # Default: pretty human-readable output. + _print_pretty_profile(hw, caps, rec) def cmd_optimize(args: argparse.Namespace) -> None: @@ -206,6 +792,55 @@ def cmd_optimize(args: argparse.Namespace) -> None: ) +def cmd_doctor(_args: argparse.Namespace) -> None: + """Handle the `vectorprime doctor` command. + + Checks system readiness for optimized LLM inference by probing for + required binaries. Prints a checklist with pass/fail markers and a + summary line at the end. + """ + div = "─" * 24 + + # Each entry: (display_name, detection_function) + def _has_cuda() -> bool: + return shutil.which("nvidia-smi") is not None + + def _has_gpu_driver() -> bool: + # Same heuristic: nvidia-smi presence implies driver is loaded. 
+ return shutil.which("nvidia-smi") is not None + + def _has_tensorrt() -> bool: + return shutil.which("trtexec") is not None + + def _has_llama_cpp_gpu() -> bool: + return bool(shutil.which("llama-cli") or shutil.which("llama-server")) + + checks = [ + ("CUDA installed", _has_cuda), + ("GPU driver detected", _has_gpu_driver), + ("TensorRT available", _has_tensorrt), + ("llama.cpp GPU support", _has_llama_cpp_gpu), + ] + + print("VectorPrime System Check") + print(div) + print() + + all_ok = True + for label, probe in checks: + ok = probe() + marker = "✓" if ok else "✗" + if not ok: + all_ok = False + print(f"{marker} {label}") + + print() + if all_ok: + print("System ready for optimized inference.") + else: + print("Some components missing — see above.") + + def cmd_convert_to_onnx(args: argparse.Namespace) -> None: input_path: str = args.input_path output_path: str = args.output or _replace_ext(input_path, ".onnx") @@ -260,7 +895,25 @@ def build_parser() -> argparse.ArgumentParser: sub.required = True # profile - sub.add_parser("profile", help="Detect and print hardware profile as JSON.") + prof = sub.add_parser("profile", help="Detect and display hardware profile.") + prof.add_argument( + "--json", + action="store_true", + help="Output profile as JSON.", + ) + prof.add_argument( + "--verbose", + action="store_true", + help="Show full hardware diagnostics.", + ) + prof.add_argument( + "--save", + metavar="PATH", + help="Save JSON profile to file.", + ) + + # doctor + sub.add_parser("doctor", help="Check system readiness for optimized LLM inference.") # optimize opt = sub.add_parser( @@ -353,6 +1006,7 @@ def main() -> None: dispatch = { "profile": cmd_profile, + "doctor": cmd_doctor, "optimize": cmd_optimize, "convert-to-onnx": cmd_convert_to_onnx, "convert-to-gguf": cmd_convert_to_gguf,