Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions rfgboost/_woe.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,9 +119,13 @@ def __init__(
async_mode: bool = False,
tol: float = 1e-4,
n_jobs: Optional[int] = None,
monotone_constraints: Optional[dict] = None,
cat_features: Optional[Iterable[int]] = None,
) -> None:
# Store hyperparameters as attributes (sklearn BaseEstimator convention)
# monotone_constraints: {original_column_index: +1|-1|0}, translated to
# encoded-feature order at fit time (binary classification only).
self.monotone_constraints = monotone_constraints
self.n_estimators = n_estimators
self.learning_rate = learning_rate
Comment thread
sourcery-ai[bot] marked this conversation as resolved.
self.rf_n_estimators = rf_n_estimators
Expand Down Expand Up @@ -152,8 +156,36 @@ def _build_rust_params(self) -> dict[str, Any]:
async_mode=self.async_mode,
tol=self.tol,
n_jobs=self.n_jobs,
monotone_constraints=getattr(self, "_monotone_encoded", None),
)

def _encoded_monotone_list(self, n_features: int) -> Optional[list[int]]:
"""Translate {original_column_index: dir} to a per-feature list aligned
to the encoded matrix order (WOE cat columns first in cat_features order,
then numeric columns in original index order). Binary only.

Validation happens here (at fit time, where n_features is known) rather
than in __init__, to keep the sklearn convention that __init__ stores
params verbatim. Keys must be valid column indices; direction values are
normalized to their sign so only {-1, 0, 1} reach the Rust core.
"""
if not self.monotone_constraints:
return None
cat = list(self.cat_features) if self.cat_features else []
cat_set = set(cat)
numeric = [i for i in range(n_features) if i not in cat_set]
encoded_cols = cat + numeric # original column index at each encoded position

normalized: dict[int, int] = {}
for col, direction in self.monotone_constraints.items():
if not isinstance(col, (int, np.integer)) or not (0 <= int(col) < n_features):
raise ValueError(
f"monotone_constraints key {col!r} is not a valid column index "
f"in [0, {n_features})."
)
normalized[int(col)] = 1 if direction > 0 else -1 if direction < 0 else 0
return [normalized.get(orig, 0) for orig in encoded_cols]

def fit(
self,
X: ArrayLike,
Expand Down Expand Up @@ -188,6 +220,9 @@ def fit(
self._woe_multiclass = False
X_encoded = np.ascontiguousarray(X_arr, dtype=np.float64)

# Monotone constraints (binary only): translate to encoded-feature order.
self._monotone_encoded = None if is_multiclass else self._encoded_monotone_list(X_arr.shape[1])

self._model = _RustClassifier(**self._build_rust_params())
self._model.fit(
np.ascontiguousarray(X_encoded, dtype=np.float64),
Expand Down
9 changes: 8 additions & 1 deletion src/boosting.rs
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,7 @@ pub struct RFGBoostClassifier {
async_mode: bool,
tol: f64,
n_jobs: Option<usize>,
monotone_constraints: Vec<i8>,
// Fitted state
n_classes: usize,
initial_pred: Vec<f64>,
Expand All @@ -415,20 +416,23 @@ impl RFGBoostClassifier {
n_estimators=20, learning_rate=0.1, rf_n_estimators=20,
rf_max_depth=None, rf_max_features=None, bootstrap=true,
random_state=None, min_samples_split=2, min_samples_leaf=1,
use_histogram=true, async_mode=false, tol=1e-4, n_jobs=None
use_histogram=true, async_mode=false, tol=1e-4, n_jobs=None,
monotone_constraints=None
))]
fn new(
n_estimators: usize, learning_rate: f64, rf_n_estimators: usize,
rf_max_depth: Option<usize>, rf_max_features: Option<String>,
bootstrap: bool, random_state: Option<u64>,
min_samples_split: usize, min_samples_leaf: usize,
use_histogram: bool, async_mode: bool, tol: f64, n_jobs: Option<usize>,
monotone_constraints: Option<Vec<i8>>,
) -> Self {
set_thread_pool(n_jobs);
Self {
n_estimators, learning_rate, rf_n_estimators, rf_max_depth, rf_max_features,
bootstrap, random_state, min_samples_split, min_samples_leaf,
use_histogram, async_mode, tol, n_jobs,
monotone_constraints: monotone_constraints.unwrap_or_default(),
n_classes: 0, initial_pred: vec![], models: Vec::new(),
trees_used: Vec::new(), is_fitted: false,
}
Expand Down Expand Up @@ -463,6 +467,7 @@ impl RFGBoostClassifier {
max_depth: self.rf_max_depth, min_samples_split: self.min_samples_split,
min_samples_leaf: self.min_samples_leaf, is_classification: false,
max_features: if max_feat < n_features { Some(max_feat) } else { None },
monotone_constraints: self.monotone_constraints.clone(),
};

// Detect classes
Expand Down Expand Up @@ -783,6 +788,7 @@ impl RFGBoostRegressor {
max_depth: self.rf_max_depth, min_samples_split: self.min_samples_split,
min_samples_leaf: self.min_samples_leaf, is_classification: false,
max_features: if max_feat < n_features { Some(max_feat) } else { None },
monotone_constraints: Vec::new(),
};

self.initial_pred = if total_w_train > 0.0 {
Expand Down Expand Up @@ -987,6 +993,7 @@ impl RFGBoost {
n_estimators, learning_rate, rf_n_estimators, rf_max_depth, rf_max_features,
bootstrap, random_state, min_samples_split, min_samples_leaf,
use_histogram, async_mode, tol, n_jobs,
None,
)),
reg: None,
task: task.to_string(),
Expand Down
4 changes: 3 additions & 1 deletion src/decision_tree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,11 +88,13 @@ impl DecisionTree {
min_samples_leaf: self.min_samples_leaf,
is_classification: self.task == "classification",
max_features: None,
monotone_constraints: Vec::new(),
};
let y_slice = y_arr.as_slice().unwrap();
let indices: Vec<usize> = (0..n_samples).collect();
let mut rng = crate::tree::seed_rng(self.random_state);
self.root = Some(build_node_exact(&x_arr.view(), y_slice, &weights, &indices, 0, &config, &mut rng));
self.root = Some(build_node_exact(&x_arr.view(), y_slice, &weights, &indices, 0, &config, &mut rng,
f64::NEG_INFINITY, f64::INFINITY));
self.is_fitted = true;
Ok(())
}
Expand Down
2 changes: 2 additions & 0 deletions src/random_forest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ impl RandomForestRegressor {
max_depth: self.max_depth, min_samples_split: self.min_samples_split,
min_samples_leaf: self.min_samples_leaf, is_classification: false,
max_features: if max_feat < n_features { Some(max_feat) } else { None },
monotone_constraints: Vec::new(),
};

let tree_params: Vec<(Vec<usize>, u64)> = (0..self.n_estimators)
Expand Down Expand Up @@ -178,6 +179,7 @@ impl RandomForestClassifier {
max_depth: self.max_depth, min_samples_split: self.min_samples_split,
min_samples_leaf: self.min_samples_leaf, is_classification: true,
max_features: if max_feat < n_features { Some(max_feat) } else { None },
monotone_constraints: Vec::new(),
};

let tree_params: Vec<(Vec<usize>, u64)> = (0..self.n_estimators)
Expand Down
127 changes: 104 additions & 23 deletions src/tree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,25 @@ pub struct TreeConfig {
pub min_samples_leaf: usize,
pub is_classification: bool,
pub max_features: Option<usize>,
/// Per-feature monotonic constraint: +1 (prediction non-decreasing in the
/// feature), -1 (non-increasing), 0 (unconstrained). Empty = all 0.
pub monotone_constraints: Vec<i8>,
}

/// Monotone constraint for a feature (0 if unconstrained or out of range).
#[inline]
fn constraint_of(config: &TreeConfig, feat: usize) -> i8 {
config.monotone_constraints.get(feat).copied().unwrap_or(0)
}

fn weighted_mean(y: &[f64], w: &[f64], indices: &[usize]) -> f64 {
let mut sw = 0.0;
let mut swy = 0.0;
for &i in indices {
sw += w[i];
swy += w[i] * y[i];
}
if sw > 0.0 { swy / sw } else { 0.0 }
}

pub fn resolve_max_features(spec: &Option<String>, n_features: usize) -> usize {
Expand Down Expand Up @@ -146,7 +165,8 @@ pub fn build_tree_on_bootstrap(
};

let indices: Vec<usize> = (0..n_boot).collect();
build_node(&x_boot.view(), &y_boot, &w_boot, &indices, 0, config, rng, &boot_hist)
build_node(&x_boot.view(), &y_boot, &w_boot, &indices, 0, config, rng, &boot_hist,
f64::NEG_INFINITY, f64::INFINITY)
}

pub fn build_tree_on_bootstrap_exact(
Expand All @@ -172,9 +192,35 @@ pub fn build_tree_on_bootstrap_exact(
}

let indices: Vec<usize> = (0..n_boot).collect();
build_node_exact(&x_boot.view(), &y_boot, &w_boot, &indices, 0, config, rng)
build_node_exact(&x_boot.view(), &y_boot, &w_boot, &indices, 0, config, rng,
f64::NEG_INFINITY, f64::INFINITY)
}

/// Given a split on `feat`, return (left_lower, left_upper, right_lower,
/// right_upper) enforcing the monotone constraint by value-bound propagation:
/// for an increasing feature every left-subtree leaf stays <= mid and every
/// right-subtree leaf stays >= mid, where mid is between the two child means.
fn child_bounds(
config: &TreeConfig, feat: usize, y: &[f64], w: &[f64],
left_idx: &[usize], right_idx: &[usize], lower: f64, upper: f64,
) -> (f64, f64, f64, f64) {
let c = constraint_of(config, feat);
if c == 0 {
return (lower, upper, lower, upper);
}
let lv = weighted_mean(y, w, left_idx);
let rv = weighted_mean(y, w, right_idx);
let mut mid = 0.5 * (lv + rv);
if mid < lower { mid = lower; }
if mid > upper { mid = upper; }
if c > 0 {
(lower, mid, mid, upper) // increasing: left <= mid <= right
} else {
(mid, upper, lower, mid) // decreasing: left >= mid >= right
}
}

#[allow(clippy::too_many_arguments)]
pub fn build_node(
x: &ArrayView2<f64>,
y: &[f64],
Expand All @@ -184,19 +230,21 @@ pub fn build_node(
config: &TreeConfig,
rng: &mut Pcg64,
hist: &HistogramData,
lower: f64,
upper: f64,
) -> TreeNode {
let n = indices.len();

if n < config.min_samples_split || config.max_depth.is_some_and(|md| depth >= md) {
return create_leaf(y, w, indices, config);
return create_leaf(y, w, indices, config, lower, upper);
}
if config.is_classification && is_pure(y, indices) {
return create_leaf(y, w, indices, config);
return create_leaf(y, w, indices, config, lower, upper);
}

let (best_feat, best_thresh, best_gain) = find_best_split_hist(x, y, w, indices, config, rng, hist);
if best_gain <= 0.0 {
return create_leaf(y, w, indices, config);
return create_leaf(y, w, indices, config, lower, upper);
}

let (mut left_idx, mut right_idx) = (Vec::new(), Vec::new());
Expand All @@ -209,20 +257,23 @@ pub fn build_node(
}

if left_idx.len() < config.min_samples_leaf || right_idx.len() < config.min_samples_leaf {
return create_leaf(y, w, indices, config);
return create_leaf(y, w, indices, config, lower, upper);
}

let (ll, lu, rl, ru) = child_bounds(config, best_feat, y, w, &left_idx, &right_idx, lower, upper);

TreeNode {
feature: best_feat,
threshold: best_thresh,
left: Some(Box::new(build_node(x, y, w, &left_idx, depth + 1, config, rng, hist))),
right: Some(Box::new(build_node(x, y, w, &right_idx, depth + 1, config, rng, hist))),
left: Some(Box::new(build_node(x, y, w, &left_idx, depth + 1, config, rng, hist, ll, lu))),
right: Some(Box::new(build_node(x, y, w, &right_idx, depth + 1, config, rng, hist, rl, ru))),
value: 0.0,
samples: n,
class_counts: None,
}
}

#[allow(clippy::too_many_arguments)]
pub fn build_node_exact(
x: &ArrayView2<f64>,
y: &[f64],
Expand All @@ -231,19 +282,21 @@ pub fn build_node_exact(
depth: usize,
config: &TreeConfig,
rng: &mut Pcg64,
lower: f64,
upper: f64,
) -> TreeNode {
let n = indices.len();

if n < config.min_samples_split || config.max_depth.is_some_and(|md| depth >= md) {
return create_leaf(y, w, indices, config);
return create_leaf(y, w, indices, config, lower, upper);
}
if config.is_classification && is_pure(y, indices) {
return create_leaf(y, w, indices, config);
return create_leaf(y, w, indices, config, lower, upper);
}

let (best_feat, best_thresh, best_gain) = find_best_split_exact(x, y, w, indices, config, rng);
if best_gain <= 0.0 {
return create_leaf(y, w, indices, config);
return create_leaf(y, w, indices, config, lower, upper);
}

let (mut left_idx, mut right_idx) = (Vec::new(), Vec::new());
Expand All @@ -256,14 +309,16 @@ pub fn build_node_exact(
}

if left_idx.len() < config.min_samples_leaf || right_idx.len() < config.min_samples_leaf {
return create_leaf(y, w, indices, config);
return create_leaf(y, w, indices, config, lower, upper);
}

let (ll, lu, rl, ru) = child_bounds(config, best_feat, y, w, &left_idx, &right_idx, lower, upper);

TreeNode {
feature: best_feat,
threshold: best_thresh,
left: Some(Box::new(build_node_exact(x, y, w, &left_idx, depth + 1, config, rng))),
right: Some(Box::new(build_node_exact(x, y, w, &right_idx, depth + 1, config, rng))),
left: Some(Box::new(build_node_exact(x, y, w, &left_idx, depth + 1, config, rng, ll, lu))),
right: Some(Box::new(build_node_exact(x, y, w, &right_idx, depth + 1, config, rng, rl, ru))),
value: 0.0,
samples: n,
class_counts: None,
Expand All @@ -278,7 +333,7 @@ fn is_pure(y: &[f64], indices: &[usize]) -> bool {
indices.iter().all(|&i| (y[i] as usize) == first)
}

fn create_leaf(y: &[f64], w: &[f64], indices: &[usize], config: &TreeConfig) -> TreeNode {
fn create_leaf(y: &[f64], w: &[f64], indices: &[usize], config: &TreeConfig, lower: f64, upper: f64) -> TreeNode {
let n = indices.len();
if config.is_classification {
let mut counts: HashMap<usize, f64> = HashMap::new();
Expand All @@ -300,19 +355,19 @@ fn create_leaf(y: &[f64], w: &[f64], indices: &[usize], config: &TreeConfig) ->
class_counts: Some(counts),
}
} else {
let mut sw = 0.0_f64;
let mut swy = 0.0_f64;
for &i in indices {
sw += w[i];
swy += w[i] * y[i];
}
let mean = if sw > 0.0 { swy / sw } else { 0.0 };
// Weighted mean of the leaf (shares the zero-weight fallback with
// weighted_mean used during split/bound computation).
let mut value = weighted_mean(y, w, indices);
// Monotone value bounds: clamp into the interval propagated from
// constrained ancestor splits (lower=-inf/upper=+inf when unconstrained).
if value < lower { value = lower; }
if value > upper { value = upper; }
TreeNode {
feature: 0,
threshold: 0.0,
left: None,
right: None,
value: mean,
value,
samples: n,
class_counts: None,
}
Expand Down Expand Up @@ -453,6 +508,19 @@ fn find_best_split_hist(

let right_wy = total_wy - left_wy;
let right_wyy = total_wyy - left_wyy;

// Monotone constraint: reject candidate splits whose child means
// violate the required direction, so the chosen structure is
// consistent with the value-bound propagation in build_node.
let c = constraint_of(config, feat);
if c != 0 {
let lmean = left_wy / left_w;
let rmean = right_wy / right_w;
if (c > 0 && lmean > rmean) || (c < 0 && lmean < rmean) {
continue;
}
}

let left_var = left_wyy / left_w - (left_wy / left_w).powi(2);
let right_var = right_wyy / right_w - (right_wy / right_w).powi(2);
let weighted = (left_w * left_var + right_w * right_var) / total_w;
Expand Down Expand Up @@ -585,6 +653,19 @@ fn find_best_split_exact(

let right_wy = total_wy - left_wy;
let right_wyy = total_wyy - left_wyy;

// Monotone constraint: reject candidate splits whose child means
// violate the required direction, so the chosen structure is
// consistent with the value-bound propagation in build_node.
let c = constraint_of(config, feat);
if c != 0 {
let lmean = left_wy / left_w;
let rmean = right_wy / right_w;
if (c > 0 && lmean > rmean) || (c < 0 && lmean < rmean) {
continue;
}
}

let left_var = left_wyy / left_w - (left_wy / left_w).powi(2);
let right_var = right_wyy / right_w - (right_wy / right_w).powi(2);
let weighted = (left_w * left_var + right_w * right_var) / total_w;
Expand Down
Loading