diff --git a/README.md b/README.md index dad8a30..4512d41 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ VectorPrime is built for developers and researchers who run inference locally an | Ollama export | Generates a `Modelfile` with tuned `num_thread` and `num_gpu` values, ready for `ollama create` | Stable | | Format conversion | Bidirectional GGUF-to-ONNX and ONNX-to-GGUF conversion with full metadata round-trip | Stable | | Python API | PyO3 native extension — import and call from any Python script or notebook | Stable | -| CLI interface | `profile`, `optimize`, `convert-to-onnx`, and `convert-to-gguf` subcommands | Stable | +| CLI interface | `profile` (with `--json`, `--verbose`, `--save`), `optimize`, `convert-to-onnx`, `convert-to-gguf`, and `doctor` subcommands | Stable | --- @@ -118,16 +118,204 @@ VectorPrime detects which runtimes are available at startup and silently skips a vectorprime profile ``` -Prints a JSON hardware profile to stdout: +Prints a human-readable hardware summary to stdout: + +``` +VectorPrime Hardware Profile +──────────────────────────────────────── + +CPU + Model: Intel Core i9-14900HX + Cores: 32 threads + SIMD Support: AVX2 + +GPU + Model: NVIDIA GeForce RTX 4090 Laptop GPU + Vendor: NVIDIA + VRAM: 16.0 GB + Compute Capability: 8.9 + Tensor Cores: Yes + +Memory + Total RAM: 31.9 GB + Available RAM: 29.9 GB + +Acceleration Support + ✓ GPU inference available + ✓ FP16 supported + ✓ INT8 supported + ✓ TensorRT compatible + +Recommended Inference Setup + Runtime: TensorRT / llama.cpp + Precision: FP16 or INT8 + Estimated Model Capacity: + • ~70B quantized + • ~7B full GPU + +Tip: run `vectorprime profile --verbose` for full hardware diagnostics. +``` + +**Options:** + +``` +vectorprime profile [OPTIONS] + +Options: + --json Output the full hardware profile as structured JSON to stdout. 
+ --verbose Show a detailed hardware diagnostic report including clock speed, + SIMD features, CUDA/driver versions, runtime compatibility, and + optimization hints. + --save PATH Save the JSON profile to a file. Can be combined with --json to + print JSON to stdout and save to a file simultaneously. +``` + +#### JSON output (`--json`) + +```bash +vectorprime profile --json +``` ```json { - "cpu": { "core_count": 16, "simd_level": "AVX2" }, - "gpu": { "name": "NVIDIA GeForce RTX 4090", "vram_mb": 24564, "compute_capability": [8, 9] }, - "ram": { "total_mb": 65536, "available_mb": 48000 } + "cpu": { + "brand": "Intel(R) Core(TM) i9-14900HX", + "core_count": 32, + "simd_level": "AVX2" + }, + "gpu": { + "name": "NVIDIA GeForce RTX 4090 Laptop GPU", + "vendor": "Nvidia", + "vram_mb": 16376, + "compute_capability": [8, 9], + "tensor_cores": true + }, + "ram": { + "total_mb": 31941, + "available_mb": 29935 + }, + "capabilities": { + "gpu_inference": true, + "fp16": true, + "int8": true, + "tensorrt_supported": true, + "tensor_cores": true + }, + "recommendation": { + "preferred_runtime": ["TensorRT", "llama.cpp"], + "preferred_precision": ["FP16", "INT8"] + } } ``` +#### Save to file (`--save`) + +```bash +vectorprime profile --save hw.json +# Hardware profile saved to: hw.json + +vectorprime profile --json --save hw.json +# Prints JSON to stdout AND saves to hw.json +``` + +#### Verbose diagnostics (`--verbose`) + +```bash +vectorprime profile --verbose +``` + +``` +VectorPrime Hardware Diagnostic Report +═══════════════════════════════════════ + +CPU + Model: Intel Core i9-14900HX + Architecture: x86_64 + Physical Cores: 24 + Logical Threads: 32 + Base Clock: 2.2 GHz + SIMD Features: SSE4, AVX, AVX2 + L3 Cache: 36 MB + +GPU + Model: NVIDIA GeForce RTX 4090 Laptop GPU + Vendor: NVIDIA + Compute Capability: 8.9 + VRAM: 16.0 GB + Tensor Cores: Yes + CUDA Version: 12.4 + Driver Version: 550.xx + Memory Bandwidth: ~576 GB/s + +System Memory + Total RAM: 31.9 GB + Available 
RAM: 29.9 GB + Swap: 8.0 GB + +Acceleration Support + CUDA: Available + TensorRT: Compatible + FP16 Inference: Supported + INT8 Inference: Supported + +Runtime Compatibility + llama.cpp: Supported (CPU + GPU offload) + ONNX Runtime: Supported + TensorRT: Supported + vLLM: Supported + +VectorPrime Optimization Hints + Recommended Runtime: TensorRT / llama.cpp + Recommended Precision: FP16 / INT8 + Suggested Threads: 16–32 + GPU Offload Capacity: High + +System Readiness + ✓ CUDA driver detected + ✓ GPU compute capability supported + ✓ Sufficient VRAM for large LLMs + +System ready for optimized LLM inference. +``` + +### Check System Readiness (`doctor`) + +```bash +vectorprime doctor +``` + +Probes for required inference components and reports which are available: + +``` +VectorPrime System Check +──────────────────────── + +✓ CUDA installed +✓ GPU driver detected +✓ TensorRT available +✓ llama.cpp GPU support + +System ready for optimized inference. +``` + +If a component is missing, its line shows `✗` and the summary changes to: + +``` +✗ TensorRT available +✗ llama.cpp GPU support + +Some components missing — see above. +``` + +The doctor command checks: + +| Component | Detection method | +|---|---| +| CUDA installed | `nvidia-smi` on PATH | +| GPU driver detected | `nvidia-smi` on PATH | +| TensorRT available | `trtexec` on PATH | +| llama.cpp GPU support | `llama-cli` or `llama-server` on PATH | + ### Optimize a Model ```bash diff --git a/python/vectorprime/cli.py b/python/vectorprime/cli.py index 724f479..1472ae3 100644 --- a/python/vectorprime/cli.py +++ b/python/vectorprime/cli.py @@ -1,8 +1,20 @@ -"""VectorPrime command-line interface.""" +"""VectorPrime command-line interface. + +Location: python/vectorprime/cli.py + +Summary: Entry point for the `vectorprime` CLI. Parses arguments and dispatches +to command handlers (profile, optimize, convert-to-onnx, convert-to-gguf, doctor). 
+ +Used by: pyproject.toml console_scripts entry point; also importable as a module. +""" import argparse -import sys +import json import os +import platform +import shutil +import subprocess +import sys def _print_logo() -> None: @@ -126,14 +138,588 @@ def _print_model_summary(info: dict) -> None: print(div) -def cmd_profile(_args: argparse.Namespace) -> None: +def _mb_to_gb_str(mb: float) -> str: + """Convert megabytes to a human-readable gigabyte string, e.g. '31.9 GB'.""" + return f"{mb / 1024:.1f} GB" + + +def _compute_cap_str(cap: list) -> str: + """Format a compute capability list like [8, 9] as the string '8.9'.""" + if cap and len(cap) >= 2: + return f"{cap[0]}.{cap[1]}" + return "N/A" + + +def _round_to_model_size(param_billions: int) -> int: + """Round a raw parameter count in billions to the nearest common model size.""" + common_sizes = [7, 13, 30, 70, 130] + if param_billions <= 0: + return param_billions + closest = min(common_sizes, key=lambda s: abs(s - param_billions)) + return closest + + +def _derive_capabilities(hw: dict) -> dict: + """Derive capability flags from a raw hardware profile dict. + + Args: + hw: Parsed hardware profile containing 'cpu', 'gpu', and 'ram' keys. + + Returns: + A dict with boolean capability flags. + """ + gpu = hw.get("gpu") + vendor = (gpu.get("vendor", "") if gpu else "").lower() + cap = gpu.get("compute_capability", [0, 0]) if gpu else [0, 0] + cap_tuple = tuple(cap[:2]) if len(cap) >= 2 else (0, 0) + is_nvidia = vendor == "nvidia" + tensorrt_ok = is_nvidia and cap_tuple >= (7, 0) + tensor_cores = is_nvidia and cap_tuple >= (7, 0) + return { + "gpu_inference": gpu is not None, + "fp16": gpu is not None, + "int8": gpu is not None, + "tensorrt_supported": tensorrt_ok, + "tensor_cores": tensor_cores, + } + + +def _derive_recommendation(hw: dict, caps: dict, installed_runtimes: dict) -> dict: + """Derive runtime and precision recommendations from hardware and capabilities. 
+ + Runtime recommendations are ordered by suitability for the detected hardware + class and filtered to only runtimes confirmed as installed. If nothing in + the priority list is installed the full priority list is returned as + aspirational recommendations (so the caller still gets a useful answer). + + Precision recommendations are based solely on hardware capabilities and do + not depend on installed software. + + Args: + hw: Parsed hardware profile dict. + caps: Capability flags from _derive_capabilities(). + installed_runtimes: Boolean dict from _check_runtime_support(). + + Returns: + A dict with 'preferred_runtime' and 'preferred_precision' lists. + Values are human-readable display strings (e.g. "llama.cpp", "FP16"). + """ + gpu = hw.get("gpu") + vendor = (gpu.get("vendor", "") if gpu else "").lower() + cpu = hw.get("cpu", {}) + simd_level = cpu.get("simd_level", "") + + cap = (gpu.get("compute_capability", [0, 0]) if gpu else [0, 0]) + cap_tuple = tuple(cap[:2]) if len(cap) >= 2 else (0, 0) + + is_nvidia = vendor == "nvidia" + is_amd = vendor == "amd" + is_apple = vendor in ("apple", "metal") + + # --- Runtime priority list (keys from _check_runtime_support) --- + if is_nvidia and cap_tuple >= (7, 0): + # Volta and above: TensorRT is the optimal choice. + priority = ["tensorrt", "vllm", "onnx_runtime", "llama_cpp", "ollama"] + elif is_nvidia: + # Pascal and older NVIDIA: no TensorRT Tensor Core benefit. + priority = ["vllm", "onnx_runtime", "llama_cpp", "ollama"] + elif is_amd: + priority = ["llama_cpp", "onnx_runtime", "ollama"] + elif is_apple: + priority = ["llama_cpp", "onnx_runtime", "ollama"] + elif gpu: + # Unknown GPU vendor — safe defaults. + priority = ["llama_cpp", "onnx_runtime", "ollama"] + else: + # CPU-only. + priority = ["llama_cpp", "ollama", "onnx_runtime"] + + # Filter priority list to installed runtimes; fall back to full list if + # nothing is installed (aspirational recommendations). 
+ installed_priority = [k for k in priority if installed_runtimes.get(k)] + effective_priority = installed_priority if installed_priority else priority + preferred_runtime = [_RUNTIME_LABELS[k] for k in effective_priority] + + # --- Precision based on hardware capabilities only --- + if is_nvidia and cap_tuple >= (8, 0): + # Ampere / Ada / Hopper: full Tensor Core support including INT4. + preferred_precision = ["FP16", "INT8", "INT4"] + elif is_nvidia and cap_tuple >= (7, 0): + # Volta / Turing: Tensor Cores for FP16 and INT8. + preferred_precision = ["FP16", "INT8"] + elif is_nvidia: + # Pascal and older: no Tensor Cores. + preferred_precision = ["FP32", "FP16"] + elif is_amd or is_apple: + # ROCm and Metal MPS both support FP16 and INT8. + preferred_precision = ["FP16", "INT8"] + elif gpu: + # Unknown GPU. + preferred_precision = ["FP16", "INT8"] + else: + # CPU-only: choose quantized formats based on available SIMD. + if simd_level == "AVX512": + preferred_precision = ["INT8", "Q4_K_M", "Q8_0"] + elif simd_level == "AVX2": + preferred_precision = ["Q4_K_M", "Q8_0", "INT8"] + else: + preferred_precision = ["Q4_K_M", "Q8_0"] + + return { + "preferred_runtime": preferred_runtime, + "preferred_precision": preferred_precision, + } + + +def _model_capacity_estimate(vram_mb: float) -> tuple: + """Estimate model capacity from VRAM. + + Args: + vram_mb: Available VRAM in megabytes. + + Returns: + Tuple of (quantized_B, full_gpu_B) as rounded parameter counts in billions. + """ + vram_gb = vram_mb / 1024 + quantized_raw = int(vram_gb * 4) + full_gpu_raw = int(vram_gb / 2) + quantized_b = _round_to_model_size(quantized_raw) if quantized_raw > 0 else quantized_raw + full_gpu_b = _round_to_model_size(full_gpu_raw) if full_gpu_raw > 0 else full_gpu_raw + return quantized_b, full_gpu_b + + +def _get_nvidia_smi_info() -> dict: + """Run nvidia-smi and parse CUDA and driver version from its output. 
+ + Returns: + Dict with 'cuda_version' and 'driver_version' strings, or 'N/A' on failure. + """ + result = {"cuda_version": "N/A", "driver_version": "N/A"} + if not shutil.which("nvidia-smi"): + return result + try: + proc = subprocess.run( + ["nvidia-smi"], + capture_output=True, + text=True, + timeout=5, + ) + for line in proc.stdout.splitlines(): + if "CUDA Version:" in line: + parts = line.split("CUDA Version:") + if len(parts) > 1: + result["cuda_version"] = parts[1].strip().rstrip("|").strip() + if "Driver Version:" in line: + parts = line.split("Driver Version:") + if len(parts) > 1: + # Driver Version: 550.xx CUDA Version: 12.4 + token = parts[1].strip().split()[0] + result["driver_version"] = token + except Exception: + pass + return result + + +def _get_verbose_cpu_info(hw: dict) -> dict: + """Gather verbose CPU information from /proc/cpuinfo and the hardware profile. + + Args: + hw: Parsed hardware profile dict. + + Returns: + Dict with cpu diagnostic fields; missing values fall back to 'N/A'. + """ + cpu = hw.get("cpu", {}) + logical_threads = cpu.get("core_count", "N/A") + arch = platform.machine() + + # Physical cores: try psutil, otherwise estimate as logical // 2. + try: + import psutil # type: ignore[import] + physical_cores = psutil.cpu_count(logical=False) or "N/A" + except ImportError: + physical_cores = (logical_threads // 2) if isinstance(logical_threads, int) else "N/A" + + # Base clock from /proc/cpuinfo. + base_clock = "N/A" + try: + with open("/proc/cpuinfo", "r", encoding="utf-8") as f: + for line in f: + if line.lower().startswith("cpu mhz"): + mhz_str = line.split(":", 1)[1].strip() + mhz = float(mhz_str) + base_clock = f"{mhz / 1000:.1f} GHz" + break + except Exception: + pass + + # Expand SIMD level into a full feature list. 
+ simd_map = { + "AVX512": "SSE4, AVX, AVX2, AVX512", + "AVX2": "SSE4, AVX, AVX2", + "AVX": "SSE4, AVX", + } + simd_raw = cpu.get("simd_level", "") + simd_features = simd_map.get(simd_raw, "SSE2") + + # L3 cache from /proc/cpuinfo. + l3_cache = "N/A" + try: + with open("/proc/cpuinfo", "r", encoding="utf-8") as f: + for line in f: + if "cache size" in line.lower(): + l3_cache = line.split(":", 1)[1].strip() + break + except Exception: + pass + + return { + "arch": arch, + "physical_cores": physical_cores, + "logical_threads": logical_threads, + "base_clock": base_clock, + "simd_features": simd_features, + "l3_cache": l3_cache, + } + + +def _get_verbose_mem_info() -> dict: + """Parse /proc/meminfo for swap total. + + Returns: + Dict with 'swap' as a GB string, or 'N/A'. + """ + swap = "N/A" + try: + with open("/proc/meminfo", "r", encoding="utf-8") as f: + for line in f: + if line.startswith("SwapTotal:"): + kb = int(line.split()[1]) + swap = f"{kb / 1024 / 1024:.1f} GB" + break + except Exception: + pass + return {"swap": swap} + + +def _get_gpu_bandwidth(gpu_name: str) -> str: + """Look up approximate memory bandwidth for a known GPU model. + + Args: + gpu_name: GPU model name string from the hardware profile. + + Returns: + Memory bandwidth string, e.g. '~1008 GB/s', or 'N/A'. + """ + name_lower = gpu_name.lower() + bandwidth_table = { + "rtx 4090 laptop": "~576 GB/s", + "rtx 4090": "~1008 GB/s", + "rtx 3090": "~936 GB/s", + "rtx 3080": "~760 GB/s", + "rtx 3070": "~448 GB/s", + "a100": "~2000 GB/s", + "h100": "~3350 GB/s", + } + for key, val in bandwidth_table.items(): + if key in name_lower: + return val + return "N/A" + + +def _check_runtime_support() -> dict: + """Check which inference runtimes are available on this system. + + Uses PATH probing for binary runtimes and importlib.util.find_spec for + Python-package-based runtimes (onnxruntime, vllm) to avoid importing them. + + Returns: + Dict mapping internal runtime keys to booleans. 
+ Keys: "llama_cpp", "ollama", "tensorrt", "onnx_runtime", "vllm". + """ + import importlib.util + + return { + "llama_cpp": bool(shutil.which("llama-cli") or shutil.which("llama-server")), + "ollama": shutil.which("ollama") is not None, + "tensorrt": shutil.which("trtexec") is not None, + "onnx_runtime": importlib.util.find_spec("onnxruntime") is not None, + "vllm": importlib.util.find_spec("vllm") is not None, + } + + +# Human-readable display labels for each internal runtime key. +_RUNTIME_LABELS: dict = { + "tensorrt": "TensorRT", + "vllm": "vLLM", + "onnx_runtime": "ONNX Runtime", + "llama_cpp": "llama.cpp", + "ollama": "Ollama", +} + + +def _print_pretty_profile(hw: dict, caps: dict, rec: dict) -> None: + """Print the default human-readable hardware profile summary. + + Args: + hw: Parsed hardware profile dict. + caps: Capability flags from _derive_capabilities(). + rec: Recommendations from _derive_recommendation(). + """ + div = "─" * 40 + cpu = hw.get("cpu", {}) + gpu = hw.get("gpu") + ram = hw.get("ram", {}) + + print("VectorPrime Hardware Profile") + print(div) + print() + + # CPU section. + print("CPU") + brand = cpu.get("brand", "N/A") + # Strip "(R)", "(TM)" for a cleaner display name. + display_brand = brand.replace("(R)", "").replace("(TM)", "").strip() + print(f" {'Model:':<22} {display_brand}") + print(f" {'Cores:':<22} {cpu.get('core_count', 'N/A')} threads") + print(f" {'SIMD Support:':<22} {cpu.get('simd_level', 'N/A')}") + print() + + # GPU section. 
+ print("GPU") + if gpu: + gpu_name = gpu.get("name", "N/A") + vendor = gpu.get("vendor", "N/A") + vram_mb = gpu.get("vram_mb", 0) + cap = gpu.get("compute_capability", []) + print(f" {'Model:':<22} {gpu_name}") + print(f" {'Vendor:':<22} {vendor.upper() if vendor != 'N/A' else vendor}") + print(f" {'VRAM:':<22} {_mb_to_gb_str(vram_mb)}") + print(f" {'Compute Capability:':<22} {_compute_cap_str(cap)}") + tc_str = "Yes" if caps.get("tensor_cores") else "No" + print(f" {'Tensor Cores:':<22} {tc_str}") + else: + print(" No GPU detected.") + print() + + # Memory section. + print("Memory") + print(f" {'Total RAM:':<22} {_mb_to_gb_str(ram.get('total_mb', 0))}") + print(f" {'Available RAM:':<22} {_mb_to_gb_str(ram.get('available_mb', 0))}") + print() + + # Acceleration support. + print("Acceleration Support") + _checkmark = lambda flag: "✓" if flag else "✗" + print(f" {_checkmark(caps['gpu_inference'])} GPU inference available") + print(f" {_checkmark(caps['fp16'])} FP16 supported") + print(f" {_checkmark(caps['int8'])} INT8 supported") + print(f" {_checkmark(caps['tensorrt_supported'])} TensorRT compatible") + print() + + # Recommended inference setup. + print("Recommended Inference Setup") + runtime_str = " / ".join(rec.get("preferred_runtime", [])) + precision_str = " or ".join(rec.get("preferred_precision", [])) + print(f" {'Runtime:':<22} {runtime_str}") + print(f" {'Precision:':<22} {precision_str}") + + if gpu: + vram_mb = gpu.get("vram_mb", 0) + q_b, full_b = _model_capacity_estimate(vram_mb) + print(f" Estimated Model Capacity:") + print(f" • ~{q_b}B quantized") + print(f" • ~{full_b}B full GPU") + + print() + print("Tip: run `vectorprime profile --verbose` for full hardware diagnostics.") + + +def _print_verbose_profile(hw: dict, caps: dict, rec: dict, installed_runtimes: dict) -> None: + """Print the full verbose hardware diagnostic report. + + Args: + hw: Parsed hardware profile dict. + caps: Capability flags from _derive_capabilities(). 
+ rec: Recommendations from _derive_recommendation(). + installed_runtimes: Boolean dict from _check_runtime_support(). + """ + heavy_div = "═" * 39 + div = "─" * 39 + cpu = hw.get("cpu", {}) + gpu = hw.get("gpu") + ram = hw.get("ram", {}) + + cpu_verbose = _get_verbose_cpu_info(hw) + mem_verbose = _get_verbose_mem_info() + nvidia_info = _get_nvidia_smi_info() if gpu else {"cuda_version": "N/A", "driver_version": "N/A"} + + print("VectorPrime Hardware Diagnostic Report") + print(heavy_div) + print() + + # CPU. + print("CPU") + brand = cpu.get("brand", "N/A") + display_brand = brand.replace("(R)", "").replace("(TM)", "").strip() + print(f" {'Model:':<22} {display_brand}") + print(f" {'Architecture:':<22} {cpu_verbose['arch']}") + print(f" {'Physical Cores:':<22} {cpu_verbose['physical_cores']}") + print(f" {'Logical Threads:':<22} {cpu_verbose['logical_threads']}") + print(f" {'Base Clock:':<22} {cpu_verbose['base_clock']}") + print(f" {'SIMD Features:':<22} {cpu_verbose['simd_features']}") + print(f" {'L3 Cache:':<22} {cpu_verbose['l3_cache']}") + print() + + # GPU. + print("GPU") + if gpu: + gpu_name = gpu.get("name", "N/A") + vendor = gpu.get("vendor", "N/A") + vram_mb = gpu.get("vram_mb", 0) + cap = gpu.get("compute_capability", []) + bw = _get_gpu_bandwidth(gpu_name) + tc_str = "Yes" if caps.get("tensor_cores") else "No" + print(f" {'Model:':<22} {gpu_name}") + print(f" {'Vendor:':<22} {vendor.upper() if vendor != 'N/A' else vendor}") + print(f" {'Compute Capability:':<22} {_compute_cap_str(cap)}") + print(f" {'VRAM:':<22} {_mb_to_gb_str(vram_mb)}") + print(f" {'Tensor Cores:':<22} {tc_str}") + print(f" {'CUDA Version:':<22} {nvidia_info['cuda_version']}") + print(f" {'Driver Version:':<22} {nvidia_info['driver_version']}") + print(f" {'Memory Bandwidth:':<22} {bw}") + else: + print(" No GPU detected.") + print() + + # System Memory. 
+ print("System Memory") + print(f" {'Total RAM:':<22} {_mb_to_gb_str(ram.get('total_mb', 0))}") + print(f" {'Available RAM:':<22} {_mb_to_gb_str(ram.get('available_mb', 0))}") + print(f" {'Swap:':<22} {mem_verbose['swap']}") + print() + + # Acceleration support. + print("Acceleration Support") + cuda_avail = "Available" if shutil.which("nvidia-smi") else "Not found" + trt_avail = "Compatible" if caps.get("tensorrt_supported") else "Not available" + fp16_avail = "Supported" if caps.get("fp16") else "Not supported" + int8_avail = "Supported" if caps.get("int8") else "Not supported" + print(f" {'CUDA:':<22} {cuda_avail}") + print(f" {'TensorRT:':<22} {trt_avail}") + print(f" {'FP16 Inference:':<22} {fp16_avail}") + print(f" {'INT8 Inference:':<22} {int8_avail}") + print() + + # Runtime compatibility — display clean labels with Supported / Not found status. + print("Runtime Compatibility") + for key, label in _RUNTIME_LABELS.items(): + status = "Supported" if installed_runtimes.get(key) else "Not found" + print(f" {(label + ':'):<22} {status}") + print() + + # VectorPrime optimization hints. + print("VectorPrime Optimization Hints") + runtime_str = " / ".join(rec.get("preferred_runtime", [])) + precision_str = " / ".join(rec.get("preferred_precision", [])) + logical = cpu.get("core_count", 0) + thread_lo = max(1, logical // 2) if isinstance(logical, int) else "N/A" + thread_hi = logical if isinstance(logical, int) else "N/A" + thread_range = f"{thread_lo}–{thread_hi}" if isinstance(thread_lo, int) else "N/A" + gpu_offload = "High" if (gpu and gpu.get("vram_mb", 0) >= 8192) else ("Medium" if gpu else "None") + print(f" {'Recommended Runtime:':<26} {runtime_str}") + print(f" {'Recommended Precision:':<26} {precision_str}") + print(f" {'Suggested Threads:':<26} {thread_range}") + print(f" {'GPU Offload Capacity:':<26} {gpu_offload}") + print() + + # System readiness. 
+ print("System Readiness") + cuda_ok = shutil.which("nvidia-smi") is not None + gpu_cap_ok = gpu is not None + vram_ok = gpu is not None and gpu.get("vram_mb", 0) >= 4096 + _c = lambda flag: "✓" if flag else "✗" + print(f" {_c(cuda_ok)} CUDA driver detected") + print(f" {_c(gpu_cap_ok)} GPU compute capability supported") + print(f" {_c(vram_ok)} Sufficient VRAM for large LLMs") + print() + if cuda_ok and gpu_cap_ok and vram_ok: + print("System ready for optimized LLM inference.") + else: + print("Some system components may limit inference performance.") + + +def cmd_profile(args: argparse.Namespace) -> None: + """Handle the `vectorprime profile` command. + + Supports four output modes controlled by flags: + - default: Pretty human-readable summary. + - --verbose: Full hardware diagnostic report. + - --json: Structured JSON output to stdout. + - --save: Save JSON to a file (combinable with --json). + """ try: import vectorprime._vectorprime as _vectorprime # type: ignore[import] - hw = _vectorprime.profile_hardware() - print(hw.to_json()) + hw_obj = _vectorprime.profile_hardware() + hw: dict = json.loads(hw_obj.to_json()) except RuntimeError as e: print(f"ERROR: {e}", file=sys.stderr) sys.exit(1) + except Exception as e: + print(f"ERROR: Failed to parse hardware profile: {e}", file=sys.stderr) + sys.exit(1) + + caps = _derive_capabilities(hw) + installed_runtimes = _check_runtime_support() + rec = _derive_recommendation(hw, caps, installed_runtimes) + + # Determine output mode. --verbose takes precedence over --json. + verbose: bool = getattr(args, "verbose", False) + as_json: bool = getattr(args, "json", False) + save_path: str | None = getattr(args, "save", None) + + if verbose: + _print_verbose_profile(hw, caps, rec, installed_runtimes) + return + + if as_json or save_path: + # Build enriched JSON payload. 
+ gpu = hw.get("gpu") + cap = (gpu.get("compute_capability", []) if gpu else []) + enriched_gpu = None + if gpu: + enriched_gpu = { + "name": gpu.get("name"), + "vendor": gpu.get("vendor"), + "vram_mb": gpu.get("vram_mb"), + "compute_capability": cap, + "tensor_cores": caps.get("tensor_cores", False), + } + payload = { + "cpu": hw.get("cpu"), + "gpu": enriched_gpu, + "ram": hw.get("ram"), + "capabilities": caps, + "recommendation": rec, + } + json_str = json.dumps(payload, indent=2) + + if as_json: + print(json_str) + + if save_path: + try: + with open(save_path, "w", encoding="utf-8") as f: + f.write(json_str) + f.write("\n") + print(f"Hardware profile saved to: {save_path}") + except OSError as e: + print(f"ERROR: Could not write to {save_path}: {e}", file=sys.stderr) + sys.exit(1) + return + + # Default: pretty human-readable output. + _print_pretty_profile(hw, caps, rec) def cmd_optimize(args: argparse.Namespace) -> None: @@ -206,6 +792,55 @@ def cmd_optimize(args: argparse.Namespace) -> None: ) +def cmd_doctor(_args: argparse.Namespace) -> None: + """Handle the `vectorprime doctor` command. + + Checks system readiness for optimized LLM inference by probing for + required binaries. Prints a checklist with pass/fail markers and a + summary line at the end. + """ + div = "─" * 24 + + # Each entry: (display_name, detection_function) + def _has_cuda() -> bool: + return shutil.which("nvidia-smi") is not None + + def _has_gpu_driver() -> bool: + # Same heuristic: nvidia-smi presence implies driver is loaded. 
+ return shutil.which("nvidia-smi") is not None + + def _has_tensorrt() -> bool: + return shutil.which("trtexec") is not None + + def _has_llama_cpp_gpu() -> bool: + return bool(shutil.which("llama-cli") or shutil.which("llama-server")) + + checks = [ + ("CUDA installed", _has_cuda), + ("GPU driver detected", _has_gpu_driver), + ("TensorRT available", _has_tensorrt), + ("llama.cpp GPU support", _has_llama_cpp_gpu), + ] + + print("VectorPrime System Check") + print(div) + print() + + all_ok = True + for label, probe in checks: + ok = probe() + marker = "✓" if ok else "✗" + if not ok: + all_ok = False + print(f"{marker} {label}") + + print() + if all_ok: + print("System ready for optimized inference.") + else: + print("Some components missing — see above.") + + def cmd_convert_to_onnx(args: argparse.Namespace) -> None: input_path: str = args.input_path output_path: str = args.output or _replace_ext(input_path, ".onnx") @@ -260,7 +895,25 @@ def build_parser() -> argparse.ArgumentParser: sub.required = True # profile - sub.add_parser("profile", help="Detect and print hardware profile as JSON.") + prof = sub.add_parser("profile", help="Detect and display hardware profile.") + prof.add_argument( + "--json", + action="store_true", + help="Output profile as JSON.", + ) + prof.add_argument( + "--verbose", + action="store_true", + help="Show full hardware diagnostics.", + ) + prof.add_argument( + "--save", + metavar="PATH", + help="Save JSON profile to file.", + ) + + # doctor + sub.add_parser("doctor", help="Check system readiness for optimized LLM inference.") # optimize opt = sub.add_parser( @@ -353,6 +1006,7 @@ def main() -> None: dispatch = { "profile": cmd_profile, + "doctor": cmd_doctor, "optimize": cmd_optimize, "convert-to-onnx": cmd_convert_to_onnx, "convert-to-gguf": cmd_convert_to_gguf,