Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,4 @@ tmp/
uv.lock
.python-version
experiments/
.claude/settings.local.json
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Added
- Optional, off-by-default **GPU acceleration** across the predict/explain API via a `device=` argument:
- `device="cuda"` — native CUDA (cudarc + nvrtc, `cuda` Cargo feature, NVIDIA).
- `device="mps"` / `"metal"` / `"gpu"` — wgpu → Metal/Vulkan/DX12 (`gpu` Cargo feature).
- `device="cpu"` (default) unchanged.
- Covered: `RandomForestRegressor.predict`, `RandomForestClassifier.predict`/`predict_proba`, `RFGBoost`/`RFGBoostRegressor`/`RFGBoostClassifier` `predict`/`predict_proba`, and `TreeSHAP.explain`.
- Native-only and excluded from the default and wasm wheels. A wheel carries whichever backend it was built with; requesting an unavailable device raises a clear error.
- Measured speedups over the CPU path: predict up to ~33× (A100 `cuda`) / ~12× (M4 `mps`); `TreeSHAP.explain` ~3× (exact 2^k Shapley — a correct GPU reference, not a substitute for the polynomial TreeSHAP algorithm).

## [0.1.2] - 2026-06-16

### Changed
Expand Down
28 changes: 24 additions & 4 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,11 @@ rayon = "1.10"
wgpu = { version = "29", optional = true }
pollster = { version = "0.4", optional = true }
bytemuck = { version = "1", optional = true, features = ["derive"] }
# Optional native CUDA backend (cudarc + nvrtc). Native NVIDIA only, off by
# default; enable with `--features cuda`. dynamic-loading dlopens libcuda/nvrtc
# at runtime, so no CUDA toolkit is needed at build time.
cudarc = { version = "0.19", optional = true, default-features = false, features = ["driver", "nvrtc", "dynamic-loading", "std", "cuda-12080"] }

[features]
gpu = ["dep:wgpu", "dep:pollster", "dep:bytemuck"]
cuda = ["dep:cudarc"]
12 changes: 6 additions & 6 deletions rfgboost/_woe.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,13 +204,13 @@ def fit(
)
return self

def predict(self, X: ArrayLike) -> NDArray[np.float64]:
def predict(self, X: ArrayLike, device: str = "cpu") -> NDArray[np.float64]:
X_encoded = self._prepare_X(X)
return np.array(self._model.predict(X_encoded), dtype=np.float64)
return np.array(self._model.predict(X_encoded, device), dtype=np.float64)

def predict_proba(self, X: ArrayLike) -> NDArray[np.float64]:
def predict_proba(self, X: ArrayLike, device: str = "cpu") -> NDArray[np.float64]:
X_encoded = self._prepare_X(X)
return np.array(self._model.predict_proba(X_encoded), dtype=np.float64)
return np.array(self._model.predict_proba(X_encoded, device), dtype=np.float64)

def predict_ci(self, X: ArrayLike, alpha: float = 0.05) -> NDArray[np.float64]:
X_encoded = self._prepare_X(X)
Expand Down Expand Up @@ -380,9 +380,9 @@ def fit(
)
return self

def predict(self, X: ArrayLike) -> NDArray[np.float64]:
def predict(self, X: ArrayLike, device: str = "cpu") -> NDArray[np.float64]:
X_encoded = self._prepare_X(X)
return np.array(self._model.predict(X_encoded), dtype=np.float64)
return np.array(self._model.predict(X_encoded, device), dtype=np.float64)

def predict_ci(self, X: ArrayLike, alpha: float = 0.05) -> NDArray[np.float64]:
X_encoded = self._prepare_X(X)
Expand Down
120 changes: 102 additions & 18 deletions src/boosting.rs
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,85 @@ fn get_raw_predictions(models: &[InternalRF], initial_pred: &[f64], learning_rat
pred
}

/// Returns the device names compiled into this build as a `", "`-separated list.
///
/// `"cpu"` is always present; `"cuda"` and `"mps"` are appended only when the
/// corresponding Cargo feature (`cuda` / `gpu`) is enabled. The result is used
/// in the "device not available" error message.
fn boost_devices() -> String {
    let mut names: Vec<&str> = Vec::new();
    names.push("cpu");
    #[cfg(feature = "cuda")]
    names.push("cuda");
    #[cfg(feature = "gpu")]
    names.push("mps");
    names.join(", ")
}

/// Computes the raw boosting scores (one row of `n_out` values per sample) on
/// the backend selected by `device`.
///
/// * `"cpu"` forwards to `get_raw_predictions`.
/// * `"cuda"` forwards to `boost_raw_cuda`; only recognised when the crate is
///   built with the `cuda` feature.
/// * `"mps"`, `"metal"` and `"gpu"` forward to `boost_raw_gpu`; only recognised
///   when the crate is built with the `gpu` feature.
///
/// # Errors
/// Any other device name produces a `PyValueError` whose message lists the
/// devices reported by `boost_devices()`.
fn raw_dispatch(
    models: &[InternalRF], initial_pred: &[f64], lr: f64,
    x: &ArrayView2<f64>, n_out: usize, device: &str,
) -> PyResult<Vec<Vec<f64>>> {
    if device == "cpu" {
        return Ok(get_raw_predictions(models, initial_pred, lr, x, n_out));
    }

    #[cfg(feature = "cuda")]
    {
        if device == "cuda" {
            return boost_raw_cuda(models, initial_pred, lr, x, n_out);
        }
    }

    #[cfg(feature = "gpu")]
    {
        if matches!(device, "mps" | "metal" | "gpu") {
            return boost_raw_gpu(models, initial_pred, lr, x, n_out);
        }
    }

    // Nothing compiled into this build matched the requested name.
    let available = boost_devices();
    Err(PyValueError::new_err(format!(
        "device '{}' is not available in this build. Available: {}.", device, available)))
}

/// Gathers every tree that contributes to output column `c` into one flat
/// forest, together with a per-tree weight.
///
/// `models` is read with a stride of `n_out`, starting at index `c`, so the
/// selected entries are `models[c]`, `models[c + n_out]`, ... Each tree of a
/// selected model gets the weight `lr * total_trees / trees_in_that_model`
/// (computed in `f32`). The mean kernel divides by the total tree count, so
/// this pre-scaling leaves each model's trees with an effective factor of
/// `lr / trees_in_that_model`.
///
/// Returns `None` when the selected models contain no trees at all.
#[cfg(any(feature = "cuda", feature = "gpu"))]
fn boost_class_forest<'a>(
    models: &'a [InternalRF], lr: f64, n_out: usize, c: usize,
) -> Option<(Vec<&'a crate::tree::TreeNode>, Vec<f32>)> {
    let rounds: Vec<&'a InternalRF> = models.iter().skip(c).step_by(n_out).collect();
    let total: usize = rounds.iter().map(|rf| rf.trees.len()).sum();
    if total == 0 {
        return None;
    }

    let mut trees = Vec::with_capacity(total);
    let mut weights: Vec<f32> = Vec::with_capacity(total);
    for rf in rounds {
        let scale = lr as f32 * total as f32 / rf.trees.len() as f32;
        for tree in &rf.trees {
            trees.push(tree);
        }
        // One weight per tree just appended, all sharing this model's scale.
        weights.resize(trees.len(), scale);
    }
    Some((trees, weights))
}

/// CUDA variant of the raw boosting score computation (compiled only with the
/// `cuda` feature).
///
/// Every output row starts as a copy of `initial_pred`. For each output column
/// `c`, the forest assembled by `boost_class_forest` is evaluated through
/// `CudaForest` on an `f32` copy of `x`, and the result is added to that
/// column. Columns for which `boost_class_forest` returns `None` keep their
/// initial value.
///
/// # Errors
/// Returns a `PyValueError` ("CUDA device unavailable") when
/// `CudaForest::new_scaled` yields `None`.
#[cfg(feature = "cuda")]
fn boost_raw_cuda(models: &[InternalRF], initial_pred: &[f64], lr: f64, x: &ArrayView2<f64>, n_out: usize) -> PyResult<Vec<Vec<f64>>> {
    let n_rows = x.nrows();
    let n_feat = x.ncols();
    // The forest's `predict` is handed an f32 buffer, so narrow the input once.
    let features: Vec<f32> = x.iter().map(|&v| v as f32).collect();

    let mut scores: Vec<Vec<f64>> = (0..n_rows)
        .map(|_| {
            let mut row = vec![0.0f64; n_out];
            row.copy_from_slice(initial_pred);
            row
        })
        .collect();

    for c in 0..n_out {
        let Some((trees, weights)) = boost_class_forest(models, lr, n_out, c) else {
            continue;
        };
        let forest = match crate::cuda::CudaForest::new_scaled(&trees, n_feat, &weights) {
            Some(f) => f,
            None => return Err(PyValueError::new_err("CUDA device unavailable")),
        };
        let contrib = forest.predict(&features, n_rows);
        for (i, row) in scores.iter_mut().enumerate() {
            row[c] += contrib[i] as f64;
        }
    }
    Ok(scores)
}

/// wgpu variant of the raw boosting score computation (compiled only with the
/// `gpu` feature).
///
/// Every output row starts as a copy of `initial_pred`. For each output column
/// `c`, the forest assembled by `boost_class_forest` is evaluated through
/// `GpuForest` on an `f32` copy of `x`, and the result is added to that
/// column. Columns for which `boost_class_forest` returns `None` keep their
/// initial value.
///
/// # Errors
/// Returns a `PyValueError` ("GPU device unavailable") when
/// `GpuForest::new_scaled` yields `None`.
#[cfg(feature = "gpu")]
fn boost_raw_gpu(models: &[InternalRF], initial_pred: &[f64], lr: f64, x: &ArrayView2<f64>, n_out: usize) -> PyResult<Vec<Vec<f64>>> {
    let n_rows = x.nrows();
    let n_feat = x.ncols();
    // The forest's `predict` is handed an f32 buffer, so narrow the input once.
    let features: Vec<f32> = x.iter().map(|&v| v as f32).collect();

    let mut scores: Vec<Vec<f64>> = (0..n_rows)
        .map(|_| {
            let mut row = vec![0.0f64; n_out];
            row.copy_from_slice(initial_pred);
            row
        })
        .collect();

    for c in 0..n_out {
        let Some((trees, weights)) = boost_class_forest(models, lr, n_out, c) else {
            continue;
        };
        let forest = match crate::gpu::GpuForest::new_scaled(&trees, n_feat, &weights) {
            Some(f) => f,
            None => return Err(PyValueError::new_err("GPU device unavailable")),
        };
        let contrib = forest.predict(&features, n_rows);
        for (i, row) in scores.iter_mut().enumerate() {
            row[c] += contrib[i] as f64;
        }
    }
    Ok(scores)
}

fn set_thread_pool(n_jobs: Option<usize>) {
if let Some(nj) = n_jobs {
if nj > 0 {
Expand Down Expand Up @@ -550,10 +629,12 @@ impl RFGBoostClassifier {
Ok(())
}

fn predict(&self, x: PyReadonlyArray2<f64>) -> PyResult<Vec<f64>> {
/// `device`: "cpu" (default), "cuda" or "mps"/"metal"/"gpu".
#[pyo3(signature = (x, device="cpu"))]
fn predict(&self, x: PyReadonlyArray2<f64>, device: &str) -> PyResult<Vec<f64>> {
if !self.is_fitted { return Err(PyValueError::new_err("RFGBoostClassifier has not been fitted")); }
let x_arr = x.as_array();
let raw = get_raw_predictions(&self.models, &self.initial_pred, self.learning_rate, &x_arr.view(), self.initial_pred.len());
let raw = raw_dispatch(&self.models, &self.initial_pred, self.learning_rate, &x_arr.view(), self.initial_pred.len(), device)?;

if self.n_classes == 2 {
Ok(raw.iter().map(|p| if sigmoid(p[0]) > 0.5 { 1.0 } else { 0.0 }).collect())
Expand All @@ -566,10 +647,12 @@ impl RFGBoostClassifier {
}
}

fn predict_proba(&self, x: PyReadonlyArray2<f64>) -> PyResult<Vec<Vec<f64>>> {
/// `device`: "cpu" (default), "cuda" or "mps"/"metal"/"gpu".
#[pyo3(signature = (x, device="cpu"))]
fn predict_proba(&self, x: PyReadonlyArray2<f64>, device: &str) -> PyResult<Vec<Vec<f64>>> {
if !self.is_fitted { return Err(PyValueError::new_err("RFGBoostClassifier has not been fitted")); }
let x_arr = x.as_array();
let raw = get_raw_predictions(&self.models, &self.initial_pred, self.learning_rate, &x_arr.view(), self.initial_pred.len());
let raw = raw_dispatch(&self.models, &self.initial_pred, self.learning_rate, &x_arr.view(), self.initial_pred.len(), device)?;

if self.n_classes == 2 {
Ok(raw.iter().map(|p| { let prob = sigmoid(p[0]); vec![1.0 - prob, prob] }).collect())
Expand Down Expand Up @@ -609,7 +692,7 @@ impl RFGBoostClassifier {
let z2 = z * z;

// Get the ensemble predicted probabilities
let proba = self.predict_proba(x)?;
let proba = self.predict_proba(x, "cpu")?;

Ok((0..n).map(|i| {
let p = proba[i][1]; // P(class=1)
Expand Down Expand Up @@ -884,16 +967,13 @@ impl RFGBoostRegressor {
Ok(())
}

fn predict(&self, x: PyReadonlyArray2<f64>) -> PyResult<Vec<f64>> {
/// `device`: "cpu" (default), "cuda" or "mps"/"metal"/"gpu".
#[pyo3(signature = (x, device="cpu"))]
fn predict(&self, x: PyReadonlyArray2<f64>, device: &str) -> PyResult<Vec<f64>> {
if !self.is_fitted { return Err(PyValueError::new_err("RFGBoostRegressor has not been fitted")); }
let x_arr = x.as_array();
let n = x_arr.nrows();
let mut pred = vec![self.initial_pred; n];
for rf in &self.models {
let update = rf.predict_all(&x_arr.view());
for i in 0..n { pred[i] += self.learning_rate * update[i]; }
}
Ok(pred)
let raw = raw_dispatch(&self.models, &[self.initial_pred], self.learning_rate, &x_arr.view(), 1, device)?;
Ok(raw.into_iter().map(|r| r[0]).collect())
}

/// Confidence intervals via split conformal prediction.
Expand Down Expand Up @@ -1021,14 +1101,18 @@ impl RFGBoost {
else { Err(PyValueError::new_err("Invalid state")) }
}

fn predict(&self, x: PyReadonlyArray2<f64>) -> PyResult<Vec<f64>> {
if let Some(clf) = &self.clf { clf.predict(x) }
else if let Some(reg) = &self.reg { reg.predict(x) }
/// `device`: "cpu" (default), "cuda" or "mps"/"metal"/"gpu".
#[pyo3(signature = (x, device="cpu"))]
fn predict(&self, x: PyReadonlyArray2<f64>, device: &str) -> PyResult<Vec<f64>> {
if let Some(clf) = &self.clf { clf.predict(x, device) }
else if let Some(reg) = &self.reg { reg.predict(x, device) }
else { Err(PyValueError::new_err("Invalid state")) }
}

fn predict_proba(&self, x: PyReadonlyArray2<f64>) -> PyResult<Vec<Vec<f64>>> {
if let Some(clf) = &self.clf { clf.predict_proba(x) }
/// `device`: "cpu" (default), "cuda" or "mps"/"metal"/"gpu".
#[pyo3(signature = (x, device="cpu"))]
fn predict_proba(&self, x: PyReadonlyArray2<f64>, device: &str) -> PyResult<Vec<Vec<f64>>> {
if let Some(clf) = &self.clf { clf.predict_proba(x, device) }
else { Err(PyValueError::new_err("predict_proba is only available for classification")) }
}

Expand Down
Loading
Loading