Merged
1 change: 1 addition & 0 deletions .worktrees/intelligent-parameter-ordering
Submodule intelligent-parameter-ordering added at dc4f8d
18 changes: 18 additions & 0 deletions CLAUDE.md
@@ -0,0 +1,18 @@
# Project Memory

This file contains project-specific memory managed by the PACT framework.
The global PACT Orchestrator is loaded from `~/.claude/CLAUDE.md`.

<!-- SESSION_START -->
## Current Session
<!-- Auto-managed by session_init hook. Overwritten each session. -->
- Resume: `claude --resume 6dd7d437-9505-4b0c-b270-a2d38bc4fa19`
- Team: `pact-6dd7d437`
- Started: 2026-03-15 01:26:28 UTC
<!-- SESSION_END -->

## Retrieved Context
<!-- Auto-managed by pact-memory skill. Last 3 retrieved memories shown. -->

## Working Memory
<!-- Auto-managed by pact-memory skill. Last 3 memories shown. Full history searchable via pact-memory skill. -->
97 changes: 97 additions & 0 deletions INSTALLATION_FIX.md
@@ -0,0 +1,97 @@
# Installation Fix: Pre-built Wheels for End-Users

## Problem
Previously, `pip install vectorprime` would fail on hosts without Rust installed because the build system always tried to compile the Rust bindings from source.

## Solution
Set up **cibuildwheel** in CI/CD to automatically build pre-compiled wheels for multiple platforms and Python versions. End-users now get pre-built binaries and don't need Rust.

## Changes Made

### 1. GitHub Actions Workflow (`.github/workflows/ci.yml`)
- **Added Job 5: `build-wheels`** — Runs on all platforms (Linux, macOS, Windows)
- Uses `pypa/cibuildwheel@v2.17.0` to build wheels
- Builds for Python 3.9, 3.10, 3.11, 3.12
- Builds for Linux (x86_64, aarch64), macOS (x86_64, arm64), Windows (x86_64)
- Uploads wheels as GitHub artifacts

- **Updated Job 6: `publish`** — Now publishes pre-built wheels to PyPI
- Downloads all platform-specific wheels from `build-wheels` job
- Uses `twine` to upload to PyPI
- Only runs on version tags (v*)
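
A minimal sketch of such a job is shown below. Job names, action versions, and artifact names here are illustrative assumptions, not the exact contents of the committed `ci.yml`:

```yaml
# Illustrative build-wheels job; names and versions are assumptions,
# not a copy of .github/workflows/ci.yml.
build-wheels:
  strategy:
    matrix:
      os: [ubuntu-latest, macos-latest, windows-latest]
  runs-on: ${{ matrix.os }}
  steps:
    - uses: actions/checkout@v4
    - uses: pypa/cibuildwheel@v2.17.0
      # Reads [tool.cibuildwheel] from pyproject.toml;
      # built wheels land in ./wheelhouse by default.
    - uses: actions/upload-artifact@v4
      with:
        name: wheels-${{ matrix.os }}
        path: wheelhouse/*.whl
```

The matrix gives one wheel-building run per OS; cross-architecture builds (e.g. Linux aarch64) are handled inside cibuildwheel rather than by extra matrix entries.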

### 2. Python Configuration (`pyproject.toml`)
- **Added `[tool.cibuildwheel]` section**:
- Configures cibuildwheel to build wheels with maturin
- Specifies Python 3.9+ and skips PyPy
- Includes basic test command to verify wheels work
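
As a rough sketch, the section can look like the following. The exact keys and values are assumptions for illustration, not the committed configuration:

```toml
# Illustrative [tool.cibuildwheel] settings; treat specific values as assumptions.
[tool.cibuildwheel]
build = "cp39-* cp310-* cp311-* cp312-*"   # CPython 3.9-3.12 only
skip = "pp*"                               # no PyPy builds
test-command = "python -c 'import vectorprime'"

[tool.cibuildwheel.linux]
archs = ["x86_64", "aarch64"]              # aarch64 builds run under QEMU emulation

[tool.cibuildwheel.macos]
archs = ["x86_64", "arm64"]
```

The `test-command` runs inside a fresh environment with the freshly built wheel installed, so a bare import is enough to catch a broken native extension.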

### 3. Documentation (`README.md`)
- **Split installation into two paths**:
- **For Users**: `pip install vectorprime` — **No Rust required**
- **For Developers**: "Build from Source" section clarifies Rust IS needed for contributing
- Added platform support matrix: Linux, macOS, Windows; Python 3.9–3.12; both x86 and Arm

## What Happens Next

### On Version Tag Push
When you create a tag like `v0.1.1` and push it:

```bash
git tag v0.1.1
git push origin v0.1.1
```

GitHub Actions will automatically:
1. ✅ Run all tests (Rust, Python, build-check, coverage)
2. 🏗️ Build wheels on Ubuntu (x86_64, aarch64), macOS (Intel, Arm), Windows
3. 📦 Publish all wheels to PyPI via `twine`

### Requirements: Set PYPI_TOKEN Secret

Before the first publish, you must configure PyPI authentication in GitHub:

1. Go to https://pypi.org/account/
2. Create an API token scoped to the `vectorprime` project
3. In your GitHub repo Settings:
- Navigate to **Secrets and variables** → **Actions**
- Click **New repository secret**
- Name: `PYPI_TOKEN`
- Value: Your PyPI API token (starts with `pypi-...`)
4. Save

**Note**: The existing `PYPI_TOKEN` secret (if any) should work as-is with the new workflow.
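
For reference, a publish step typically consumes the secret as below. This is a hedged sketch of the usual `twine` pattern; the actual job may be structured differently:

```yaml
# Hypothetical publish step; the real ci.yml may differ.
- name: Upload wheels to PyPI
  env:
    TWINE_USERNAME: __token__                  # literal username required for API tokens
    TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}  # the repository secret configured above
  run: twine upload dist/*.whl
```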

## User Experience Changes

| Before | After |
|--------|-------|
| `pip install vectorprime` | ✅ Works **without Rust** (uses pre-built wheel) |
| `pip install -e .` | ⚠️ Still requires Rust (development mode) |
| Installation on no-Rust systems | ✅ **Now works** |
| Supported platforms | ✅ **All platforms** (built automatically in CI; previously only where you built) |

## Testing Locally (Optional)

To test the cibuildwheel configuration locally before pushing a tag:

```bash
pip install cibuildwheel
cibuildwheel --platform linux --output-dir ./dist
```

This runs the Linux builds inside Docker containers (Docker must be running); the finished wheels appear in `./dist/`.

## Backwards Compatibility

✅ **No breaking changes**. Existing users:
- Can continue using `pip install vectorprime` (now succeeds without a Rust toolchain)
- Can continue developing with `maturin develop` (unchanged)
- Get the same package contents as before; only the distribution format changes (pre-built wheel instead of a local source build)

## Next Steps

1. **Optional**: Test locally with `cibuildwheel`
2. **When ready**: Push a version tag: `git tag v0.1.1 && git push origin v0.1.1`
3. **Monitor**: Watch the workflow on GitHub Actions
4. **Verify**: Check PyPI for the new wheels: https://pypi.org/project/vectorprime/
10 changes: 7 additions & 3 deletions crates/vectorprime-bindings/src/lib.rs
@@ -312,15 +312,19 @@ fn optimize(
// `{stem}-optimized.gguf` placed next to the input file.
let derived_output = derive_output_path(&path, output_path.as_deref());

// Attempt re-quantization; if llama-quantize is absent we degrade
// gracefully rather than failing the whole optimization.
// Attempt re-quantization; if llama-quantize is absent or fails we degrade
// gracefully rather than failing the whole optimization, but we surface the
// error so the user can see the actual failure reason.
let quantized_path = match vectorprime_export::quantize_gguf(
&path,
&derived_output,
&result.config.quantization,
) {
Ok(()) => Some(derived_output),
Err(_) => None,
Err(e) => {
eprintln!("[vectorprime] Warning: quantization failed: {e}");
None
}
};

Ok(PyOptimizationResult {
10 changes: 8 additions & 2 deletions crates/vectorprime-export/src/lib.rs
Expand Up @@ -66,9 +66,13 @@ pub fn export_ollama(
/// (e.g. `Q4_K_M` → `"q4_k_m"`, `F16` → `"f16"`), then shells out to:
///
/// ```text
/// llama-quantize <input> <output> <type>
/// llama-quantize --allow-requantize <input> <output> <type>
/// ```
///
/// The `--allow-requantize` flag is required when the source model is already
/// quantized (e.g. `Q2_K`, `Q4_K_M`). It has been supported since llama.cpp build
/// b2606; older builds will exit non-zero with a different error message.
///
/// Returns an error if `llama-quantize` is not in PATH or exits non-zero.
pub fn quantize_gguf(input: &Path, output: &Path, quant: &QuantizationStrategy) -> Result<()> {
// Check that llama-quantize is available before invoking it.
@@ -79,6 +83,7 @@ pub fn quantize_gguf(input: &Path, output: &Path, quant: &QuantizationStrategy)
let quant_type = quant_to_llama_quantize_type(quant);

let status = Command::new("llama-quantize")
.arg("--allow-requantize")
.arg(input)
.arg(output)
.arg(quant_type)
@@ -87,7 +92,8 @@ pub fn quantize_gguf(input: &Path, output: &Path, quant: &QuantizationStrategy)

if !status.success() {
return Err(anyhow::anyhow!(
"llama-quantize exited with code {:?}",
"llama-quantize exited with code {:?} \
(if the source model is already quantized, ensure llama.cpp >= b2606 is installed)",
status.code()
));
}
57 changes: 30 additions & 27 deletions crates/vectorprime-optimizer/src/lib.rs
@@ -264,15 +264,19 @@ const N_CANDIDATES: usize = 24;
///
/// **Stage 4** runs a Tree-structured Parzen Estimator (TPE) over the remaining
/// parameter space (runtime × quantization × gpu_layers × threads × batch_size).
/// Scores are obtained via hardware-aware estimation (`estimate_llamacpp`) rather
/// than live inference, so no external binaries are required and the function
/// returns immediately.
///
/// | Stage | Name | Benchmarks |
/// |-------|-------------------------|------------------------|
/// | 1 | Hardware Profiling | 0 |
/// | 2 | Model Graph Analysis | 0 |
/// | 3 | Runtime Preselection | 0 |
/// | 4 | Bayesian Optimization | N_INITIAL + N_ITER ≤ 12 |
/// | Stage | Name | Benchmarks |
/// |-------|------------------------------------------------|------------------------|
/// | 1 | Hardware Profiling | 0 |
/// | 2 | Model Graph Analysis | 0 |
/// | 3 | Runtime Preselection | 0 |
/// | 4 | Bayesian Optimization (hardware-aware est.) | 0 (pure estimation) |
///
/// Falls back to `run_optimization_cartesian` when all 12 evaluations fail.
/// Falls back to `run_optimization_cartesian` only when the latency constraint
/// is violated; estimation itself never fails.
/// Returns `Err` only when the fallback also produces no valid configuration.
pub async fn run_optimization(
model: ModelInfo,
@@ -364,7 +368,7 @@ pub async fn run_optimization(
};

eprintln!(
"[Stage 4/4] Bayesian optimization: {N_INITIAL} initial samples + {N_ITER} refinement iterations"
"[Stage 4/4] Bayesian optimization (hardware-aware estimation): {N_INITIAL} initial samples + {N_ITER} refinement iterations"
);

// ── Stage 4a: Initial Halton samples ──────────────────────────────────────
@@ -376,17 +380,16 @@ pub async fn run_optimization(

let mut all_failure_reasons: std::collections::BTreeSet<String> =
std::collections::BTreeSet::new();
// Track distinct binary names that were missing so we can produce an
// actionable "install X" error if all evaluations fail.
let mut all_missing_binaries: std::collections::BTreeSet<String> =
std::collections::BTreeSet::new();

let mut init_results = benchmark::run_benchmarks(initial_configs, &model, &hw).await;
for binary in collect_missing_binaries(&init_results) {
all_missing_binaries.insert(binary);
}
init_results = apply_llamacpp_fallback(init_results, &model, &hw);
init_results = drop_not_installed(init_results);
// Use hardware-aware estimation instead of live benchmarks. Estimation
// always succeeds, so no fallback or not-installed filtering is needed.
let init_results: Vec<(RuntimeConfig, Result<BenchmarkResult>)> = initial_configs
.into_iter()
.map(|cfg| {
let est = estimate::estimate_llamacpp(&cfg, &model, &hw);
(cfg, Ok(est))
})
.collect();

// Record failures and seed the TPE model from successful observations.
let mut tpe = TpeModel::new(0.25);
@@ -450,12 +453,10 @@ pub async fn run_optimization(
cfg.gpu_layers,
);

let mut iter_results = benchmark::run_benchmarks(vec![cfg.clone()], &model, &hw).await;
for binary in collect_missing_binaries(&iter_results) {
all_missing_binaries.insert(binary);
}
iter_results = apply_llamacpp_fallback(iter_results, &model, &hw);
iter_results = drop_not_installed(iter_results);
// Use hardware-aware estimation — no live inference, no binary required.
let est = estimate::estimate_llamacpp(&cfg, &model, &hw);
let iter_results: Vec<(RuntimeConfig, Result<BenchmarkResult>)> =
vec![(cfg.clone(), Ok(est))];

for (result_cfg, outcome) in iter_results {
match outcome {
@@ -535,9 +536,11 @@ pub async fn run_optimization(
}
}

// ── Fallback: all 12 evaluations failed — try cartesian ───────────────────
// ── Fallback: latency constraint violated — try cartesian ─────────────────
// With pure estimation this path is only reached when the best estimated
// config violates the user's --latency constraint. Estimation itself never
// produces failures, so all_failure_reasons will always be empty here.
let reasons: Vec<String> = all_failure_reasons.into_iter().collect();
let missing: Vec<String> = all_missing_binaries.into_iter().collect();
eprintln!(
"[Bayes final] Best estimated config violated the latency constraint — falling back to cartesian search. \
Failure reasons: {}",
@@ -561,7 +564,7 @@ pub async fn run_optimization(
}
Ok(r)
}
Err(cartesian_err) => Err(no_config_error(&reasons, &missing, max_latency_ms)
Err(cartesian_err) => Err(no_config_error(&reasons, &[], max_latency_ms)
.context(format!("cartesian fallback also failed: {cartesian_err}"))),
}
}
Expand Down