diff --git a/.worktrees/intelligent-parameter-ordering b/.worktrees/intelligent-parameter-ordering new file mode 160000 index 0000000..dc4f8d6 --- /dev/null +++ b/.worktrees/intelligent-parameter-ordering @@ -0,0 +1 @@ +Subproject commit dc4f8d693fc31d6d45c9d86f5c2dfe1292f99f17 diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..298d9fb --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,18 @@ +# Project Memory + +This file contains project-specific memory managed by the PACT framework. +The global PACT Orchestrator is loaded from `~/.claude/CLAUDE.md`. + + +## Current Session + +- Resume: `claude --resume 6dd7d437-9505-4b0c-b270-a2d38bc4fa19` +- Team: `pact-6dd7d437` +- Started: 2026-03-15 01:26:28 UTC + + +## Retrieved Context + + +## Working Memory + diff --git a/INSTALLATION_FIX.md b/INSTALLATION_FIX.md new file mode 100644 index 0000000..d6ca54d --- /dev/null +++ b/INSTALLATION_FIX.md @@ -0,0 +1,97 @@ +# Installation Fix: Pre-built Wheels for End-Users + +## Problem +Previously, `pip install vectorprime` would fail on hosts without Rust installed because the build system always tried to compile the Rust bindings from source. + +## Solution +Set up **cibuildwheel** in CI/CD to automatically build pre-compiled wheels for multiple platforms and Python versions. End-users now get pre-built binaries and don't need Rust. + +## Changes Made + +### 1. GitHub Actions Workflow (`.github/workflows/ci.yml`) +- **Added Job 5: `build-wheels`** — Runs on all platforms (Linux, macOS, Windows) + - Uses `pypa/cibuildwheel@v2.17.0` to build wheels + - Builds for Python 3.9, 3.10, 3.11, 3.12 + - Builds for Linux (x86_64, aarch64), macOS (x86_64, arm64), Windows (x86_64) + - Uploads wheels as GitHub artifacts + +- **Updated Job 6: `publish`** — Now publishes pre-built wheels to PyPI + - Downloads all platform-specific wheels from `build-wheels` job + - Uses `twine` to upload to PyPI + - Only runs on version tags (v*) + +### 2. 
Python Configuration (`pyproject.toml`) +- **Added `[tool.cibuildwheel]` section**: + - Configures cibuildwheel to build wheels with maturin + - Specifies Python 3.9+ and skips PyPy + - Includes basic test command to verify wheels work + +### 3. Documentation (`README.md`) +- **Split installation into two paths**: + - **For Users**: `pip install vectorprime` — **No Rust required** + - **For Developers**: "Build from Source" section clarifies Rust IS needed for contributing +- Added platform support matrix: Linux, macOS, Windows; Python 3.9–3.12; both x86 and Arm + +## What Happens Next + +### On Version Tag Push +When you create a tag like `v0.1.1` and push it: + +```bash +git tag v0.1.1 +git push origin v0.1.1 +``` + +GitHub Actions will automatically: +1. ✅ Run all tests (Rust, Python, build-check, coverage) +2. 🏗️ Build wheels on Ubuntu (x86_64, aarch64), macOS (Intel, Arm), Windows +3. 📦 Publish all wheels to PyPI via `twine` + +### Requirements: Set PYPI_TOKEN Secret + +Before the first publish, you must configure PyPI authentication in GitHub: + +1. Go to https://pypi.org/manage/account/token/ +2. Create an API token with permissions for "VectorPrime" +3. In your GitHub repo Settings: + - Navigate to **Secrets and variables** → **Actions** + - Click **New repository secret** + - Name: `PYPI_TOKEN` + - Value: Your PyPI API token (starts with `pypi-...`) +4. Save + +**Note**: The existing `PYPI_TOKEN` secret (if any) should work as-is with the new workflow. 
+ +## User Experience Changes + +| Scenario | Before | After | +|----------|--------|-------| +| `pip install vectorprime` | ❌ Failed on hosts without Rust (always compiled from source) | ✅ Works **without Rust** (uses pre-built wheel) | +| `pip install -e .` | ⚠️ Required Rust | ⚠️ Still requires Rust (development mode) | +| Installation on no-Rust systems | ❌ Failed | ✅ **Now works** | +| Supported platforms | Limited (only where you built) | **All platforms** (automatically built in CI) | + +## Testing Locally (Optional) + +To test the cibuildwheel configuration locally before pushing a tag: + +```bash +pip install cibuildwheel +cibuildwheel --platform linux --output-dir ./dist +``` + +This builds a wheel locally. Wheels appear in `./dist/`. + +## Backwards Compatibility + +✅ **No breaking changes**. Existing users: +- Can continue using `pip install vectorprime` (now works better) +- Can continue developing with `maturin develop` (unchanged) +- Wheels are fully compatible with previous versions + +## Next Steps + +1. **Optional**: Test locally with `cibuildwheel` +2. **When ready**: Push a version tag: `git tag v0.1.1 && git push origin v0.1.1` +3. **Monitor**: Watch the workflow on GitHub Actions +4. **Verify**: Check PyPI for the new wheels: https://pypi.org/project/vectorprime/ diff --git a/crates/vectorprime-bindings/src/lib.rs b/crates/vectorprime-bindings/src/lib.rs index fa7f8a2..0b9e5d0 100644 --- a/crates/vectorprime-bindings/src/lib.rs +++ b/crates/vectorprime-bindings/src/lib.rs @@ -312,15 +312,19 @@ fn optimize( // `{stem}-optimized.gguf` placed next to the input file. let derived_output = derive_output_path(&path, output_path.as_deref()); - // Attempt re-quantization; if llama-quantize is absent we degrade - // gracefully rather than failing the whole optimization. + // Attempt re-quantization; if llama-quantize is absent or fails we degrade + // gracefully rather than failing the whole optimization, but we surface the + // error so the user can see the actual failure reason. 
let quantized_path = match vectorprime_export::quantize_gguf( &path, &derived_output, &result.config.quantization, ) { Ok(()) => Some(derived_output), - Err(_) => None, + Err(e) => { + eprintln!("[vectorprime] Warning: quantization failed: {e}"); + None + } }; Ok(PyOptimizationResult { diff --git a/crates/vectorprime-export/src/lib.rs b/crates/vectorprime-export/src/lib.rs index 42c3091..55e4641 100644 --- a/crates/vectorprime-export/src/lib.rs +++ b/crates/vectorprime-export/src/lib.rs @@ -66,9 +66,13 @@ pub fn export_ollama( /// (e.g. `Q4_K_M` → `"q4_k_m"`, `F16` → `"f16"`), then shells out to: /// /// ```text -/// llama-quantize +/// llama-quantize --allow-requantize /// ``` /// +/// The `--allow-requantize` flag is required when the source model is already +/// quantized (e.g. `Q2_K`, `Q4_K_M`). It is supported since llama.cpp build +/// b2606; older builds will exit non-zero with a different error message. +/// /// Returns an error if `llama-quantize` is not in PATH or exits non-zero. pub fn quantize_gguf(input: &Path, output: &Path, quant: &QuantizationStrategy) -> Result<()> { // Check that llama-quantize is available before invoking it. 
@@ -79,6 +83,7 @@ pub fn quantize_gguf(input: &Path, output: &Path, quant: &QuantizationStrategy) let quant_type = quant_to_llama_quantize_type(quant); let status = Command::new("llama-quantize") + .arg("--allow-requantize") .arg(input) .arg(output) .arg(quant_type) @@ -87,7 +92,8 @@ pub fn quantize_gguf(input: &Path, output: &Path, quant: &QuantizationStrategy) if !status.success() { return Err(anyhow::anyhow!( - "llama-quantize exited with code {:?}", + "llama-quantize exited with code {:?} \ + (if the source model is already quantized, ensure llama.cpp >= b2606 is installed)", status.code() )); } diff --git a/crates/vectorprime-optimizer/src/lib.rs b/crates/vectorprime-optimizer/src/lib.rs index e11e0c9..a4542b8 100644 --- a/crates/vectorprime-optimizer/src/lib.rs +++ b/crates/vectorprime-optimizer/src/lib.rs @@ -264,15 +264,19 @@ const N_CANDIDATES: usize = 24; /// /// **Stage 4** runs a Tree-structured Parzen Estimator (TPE) over the remaining /// parameter space (runtime × quantization × gpu_layers × threads × batch_size). +/// Scores are obtained via hardware-aware estimation (`estimate_llamacpp`) rather +/// than live inference, so no external binaries are required and the function +/// returns immediately. /// -/// | Stage | Name | Benchmarks | -/// |-------|-------------------------|------------------------| -/// | 1 | Hardware Profiling | 0 | -/// | 2 | Model Graph Analysis | 0 | -/// | 3 | Runtime Preselection | 0 | -/// | 4 | Bayesian Optimization | N_INITIAL + N_ITER ≤ 12 | +/// | Stage | Name | Benchmarks | +/// |-------|------------------------------------------------|------------------------| +/// | 1 | Hardware Profiling | 0 | +/// | 2 | Model Graph Analysis | 0 | +/// | 3 | Runtime Preselection | 0 | +/// | 4 | Bayesian Optimization (hardware-aware est.) | 0 (pure estimation) | /// -/// Falls back to `run_optimization_cartesian` when all 12 evaluations fail. 
+/// Falls back to `run_optimization_cartesian` only when the latency constraint +/// is violated; estimation itself never fails. /// Returns `Err` only when the fallback also produces no valid configuration. pub async fn run_optimization( model: ModelInfo, @@ -364,7 +368,7 @@ pub async fn run_optimization( }; eprintln!( - "[Stage 4/4] Bayesian optimization: {N_INITIAL} initial samples + {N_ITER} refinement iterations" + "[Stage 4/4] Bayesian optimization (hardware-aware estimation): {N_INITIAL} initial samples + {N_ITER} refinement iterations" ); // ── Stage 4a: Initial Halton samples ────────────────────────────────────── @@ -376,17 +380,16 @@ pub async fn run_optimization( let mut all_failure_reasons: std::collections::BTreeSet = std::collections::BTreeSet::new(); - // Track distinct binary names that were missing so we can produce an - // actionable "install X" error if all evaluations fail. - let mut all_missing_binaries: std::collections::BTreeSet = - std::collections::BTreeSet::new(); - let mut init_results = benchmark::run_benchmarks(initial_configs, &model, &hw).await; - for binary in collect_missing_binaries(&init_results) { - all_missing_binaries.insert(binary); - } - init_results = apply_llamacpp_fallback(init_results, &model, &hw); - init_results = drop_not_installed(init_results); + // Use hardware-aware estimation instead of live benchmarks. Estimation + // always succeeds, so no fallback or not-installed filtering is needed. + let init_results: Vec<(RuntimeConfig, Result)> = initial_configs + .into_iter() + .map(|cfg| { + let est = estimate::estimate_llamacpp(&cfg, &model, &hw); + (cfg, Ok(est)) + }) + .collect(); // Record failures and seed the TPE model from successful observations. 
let mut tpe = TpeModel::new(0.25); @@ -450,12 +453,10 @@ pub async fn run_optimization( cfg.gpu_layers, ); - let mut iter_results = benchmark::run_benchmarks(vec![cfg.clone()], &model, &hw).await; - for binary in collect_missing_binaries(&iter_results) { - all_missing_binaries.insert(binary); - } - iter_results = apply_llamacpp_fallback(iter_results, &model, &hw); - iter_results = drop_not_installed(iter_results); + // Use hardware-aware estimation — no live inference, no binary required. + let est = estimate::estimate_llamacpp(&cfg, &model, &hw); + let iter_results: Vec<(RuntimeConfig, Result)> = + vec![(cfg.clone(), Ok(est))]; for (result_cfg, outcome) in iter_results { match outcome { @@ -535,9 +536,11 @@ pub async fn run_optimization( } } - // ── Fallback: all 12 evaluations failed — try cartesian ─────────────────── + // ── Fallback: latency constraint violated — try cartesian ───────────────── + // With pure estimation this path is only reached when the best estimated + // config violates the user's --latency constraint. Estimation itself never + // produces failures, so all_failure_reasons will always be empty here. let reasons: Vec = all_failure_reasons.into_iter().collect(); - let missing: Vec = all_missing_binaries.into_iter().collect(); eprintln!( "[Bayes final] No successful evaluations — falling back to cartesian search. \ Failure reasons: {}", @@ -561,7 +564,7 @@ pub async fn run_optimization( } Ok(r) } - Err(cartesian_err) => Err(no_config_error(&reasons, &missing, max_latency_ms) + Err(cartesian_err) => Err(no_config_error(&reasons, &[], max_latency_ms) .context(format!("cartesian fallback also failed: {cartesian_err}"))), } }