diff --git a/rfgboost/_woe.py b/rfgboost/_woe.py index 8af4f59..b302370 100644 --- a/rfgboost/_woe.py +++ b/rfgboost/_woe.py @@ -119,9 +119,13 @@ def __init__( async_mode: bool = False, tol: float = 1e-4, n_jobs: Optional[int] = None, + monotone_constraints: Optional[dict] = None, cat_features: Optional[Iterable[int]] = None, ) -> None: # Store hyperparameters as attributes (sklearn BaseEstimator convention) + # monotone_constraints: {original_column_index: +1|-1|0}, translated to + # encoded-feature order at fit time (binary classification only). + self.monotone_constraints = monotone_constraints self.n_estimators = n_estimators self.learning_rate = learning_rate self.rf_n_estimators = rf_n_estimators @@ -152,8 +156,36 @@ def _build_rust_params(self) -> dict[str, Any]: async_mode=self.async_mode, tol=self.tol, n_jobs=self.n_jobs, + monotone_constraints=getattr(self, "_monotone_encoded", None), ) + def _encoded_monotone_list(self, n_features: int) -> Optional[list[int]]: + """Translate {original_column_index: dir} to a per-feature list aligned + to the encoded matrix order (WOE cat columns first in cat_features order, + then numeric columns in original index order). Binary only. + + Validation happens here (at fit time, where n_features is known) rather + than in __init__, to keep the sklearn convention that __init__ stores + params verbatim. Keys must be valid column indices; direction values are + normalized to their sign so only {-1, 0, 1} reach the Rust core. + """ + if not self.monotone_constraints: + return None + cat = list(self.cat_features) if self.cat_features else [] + cat_set = set(cat) + numeric = [i for i in range(n_features) if i not in cat_set] + encoded_cols = cat + numeric # original column index at each encoded position + + normalized: dict[int, int] = {} + for col, direction in self.monotone_constraints.items(): + if not isinstance(col, (int, np.integer)) or not (0 <= int(col) < n_features): + raise ValueError( + f"monotone_constraints key {col!r} is not a valid column index " + f"in [0, {n_features})." + ) + normalized[int(col)] = 1 if direction > 0 else -1 if direction < 0 else 0 + return [normalized.get(orig, 0) for orig in encoded_cols] + def fit( self, X: ArrayLike, @@ -188,6 +220,9 @@ def fit( self._woe_multiclass = False X_encoded = np.ascontiguousarray(X_arr, dtype=np.float64) + # Monotone constraints (binary only): translate to encoded-feature order. + self._monotone_encoded = None if is_multiclass else self._encoded_monotone_list(X_arr.shape[1]) + self._model = _RustClassifier(**self._build_rust_params()) self._model.fit( np.ascontiguousarray(X_encoded, dtype=np.float64), diff --git a/src/boosting.rs b/src/boosting.rs index ae2452a..afcd9fc 100644 --- a/src/boosting.rs +++ b/src/boosting.rs @@ -399,6 +399,7 @@ pub struct RFGBoostClassifier { async_mode: bool, tol: f64, n_jobs: Option, + monotone_constraints: Vec, // Fitted state n_classes: usize, initial_pred: Vec, @@ -415,7 +416,8 @@ impl RFGBoostClassifier { n_estimators=20, learning_rate=0.1, rf_n_estimators=20, rf_max_depth=None, rf_max_features=None, bootstrap=true, random_state=None, min_samples_split=2, min_samples_leaf=1, - use_histogram=true, async_mode=false, tol=1e-4, n_jobs=None + use_histogram=true, async_mode=false, tol=1e-4, n_jobs=None, + monotone_constraints=None ))] fn new( n_estimators: usize, learning_rate: f64, rf_n_estimators: usize, @@ -423,12 +425,14 @@ impl RFGBoostClassifier { bootstrap: bool, random_state: Option, min_samples_split: usize, min_samples_leaf: usize, use_histogram: bool, async_mode: bool, tol: f64, n_jobs: Option, + monotone_constraints: Option>, ) -> Self { set_thread_pool(n_jobs); Self { n_estimators, learning_rate, rf_n_estimators, rf_max_depth, rf_max_features, bootstrap, random_state, min_samples_split, min_samples_leaf, use_histogram, async_mode, tol, n_jobs, + monotone_constraints: monotone_constraints.unwrap_or_default(), n_classes: 0, initial_pred: vec![], models: Vec::new(), trees_used: Vec::new(), is_fitted: false, } @@ -463,6 +467,7 @@ impl RFGBoostClassifier { max_depth: self.rf_max_depth, min_samples_split: self.min_samples_split, min_samples_leaf: self.min_samples_leaf, is_classification: false, max_features: if max_feat < n_features { Some(max_feat) } else { None }, + monotone_constraints: self.monotone_constraints.clone(), }; // Detect classes @@ -783,6 +788,7 @@ impl RFGBoostRegressor { max_depth: self.rf_max_depth, min_samples_split: self.min_samples_split, min_samples_leaf: self.min_samples_leaf, is_classification: false, max_features: if max_feat < n_features { Some(max_feat) } else { None }, + monotone_constraints: Vec::new(), }; self.initial_pred = if total_w_train > 0.0 { @@ -987,6 +993,7 @@ impl RFGBoost { n_estimators, learning_rate, rf_n_estimators, rf_max_depth, rf_max_features, bootstrap, random_state, min_samples_split, min_samples_leaf, use_histogram, async_mode, tol, n_jobs, + None, )), reg: None, task: task.to_string(), diff --git a/src/decision_tree.rs b/src/decision_tree.rs index e80dc44..730e916 100644 --- a/src/decision_tree.rs +++ b/src/decision_tree.rs @@ -88,11 +88,13 @@ impl DecisionTree { min_samples_leaf: self.min_samples_leaf, is_classification: self.task == "classification", max_features: None, + monotone_constraints: Vec::new(), }; let y_slice = y_arr.as_slice().unwrap(); let indices: Vec = (0..n_samples).collect(); let mut rng = crate::tree::seed_rng(self.random_state); - self.root = Some(build_node_exact(&x_arr.view(), y_slice, &weights, &indices, 0, &config, &mut rng)); + self.root = Some(build_node_exact(&x_arr.view(), y_slice, &weights, &indices, 0, &config, &mut rng, + f64::NEG_INFINITY, f64::INFINITY)); self.is_fitted = true; Ok(()) } diff --git a/src/random_forest.rs b/src/random_forest.rs index 7166445..95caa94 100644 --- a/src/random_forest.rs +++ b/src/random_forest.rs @@ -66,6 +66,7 @@ impl RandomForestRegressor { max_depth: self.max_depth, min_samples_split: self.min_samples_split, min_samples_leaf: self.min_samples_leaf, is_classification: false, max_features: if max_feat < n_features { Some(max_feat) } else { None }, + monotone_constraints: Vec::new(), }; let tree_params: Vec<(Vec, u64)> = (0..self.n_estimators) @@ -178,6 +179,7 @@ impl RandomForestClassifier { max_depth: self.max_depth, min_samples_split: self.min_samples_split, min_samples_leaf: self.min_samples_leaf, is_classification: true, max_features: if max_feat < n_features { Some(max_feat) } else { None }, + monotone_constraints: Vec::new(), }; let tree_params: Vec<(Vec, u64)> = (0..self.n_estimators) diff --git a/src/tree.rs b/src/tree.rs index f56248e..87683ba 100644 --- a/src/tree.rs +++ b/src/tree.rs @@ -78,6 +78,25 @@ pub struct TreeConfig { pub min_samples_leaf: usize, pub is_classification: bool, pub max_features: Option, + /// Per-feature monotonic constraint: +1 (prediction non-decreasing in the + /// feature), -1 (non-increasing), 0 (unconstrained). Empty = all 0. + pub monotone_constraints: Vec, +} + +/// Monotone constraint for a feature (0 if unconstrained or out of range). +#[inline] +fn constraint_of(config: &TreeConfig, feat: usize) -> i8 { + config.monotone_constraints.get(feat).copied().unwrap_or(0) +} + +fn weighted_mean(y: &[f64], w: &[f64], indices: &[usize]) -> f64 { + let mut sw = 0.0; + let mut swy = 0.0; + for &i in indices { + sw += w[i]; + swy += w[i] * y[i]; + } + if sw > 0.0 { swy / sw } else { 0.0 } } pub fn resolve_max_features(spec: &Option, n_features: usize) -> usize { @@ -146,7 +165,8 @@ pub fn build_tree_on_bootstrap( }; let indices: Vec = (0..n_boot).collect(); - build_node(&x_boot.view(), &y_boot, &w_boot, &indices, 0, config, rng, &boot_hist) + build_node(&x_boot.view(), &y_boot, &w_boot, &indices, 0, config, rng, &boot_hist, + f64::NEG_INFINITY, f64::INFINITY) } pub fn build_tree_on_bootstrap_exact( @@ -172,9 +192,35 @@ pub fn build_tree_on_bootstrap_exact( } let indices: Vec = (0..n_boot).collect(); - build_node_exact(&x_boot.view(), &y_boot, &w_boot, &indices, 0, config, rng) + build_node_exact(&x_boot.view(), &y_boot, &w_boot, &indices, 0, config, rng, + f64::NEG_INFINITY, f64::INFINITY) } +/// Given a split on `feat`, return (left_lower, left_upper, right_lower, +/// right_upper) enforcing the monotone constraint by value-bound propagation: +/// for an increasing feature every left-subtree leaf stays <= mid and every +/// right-subtree leaf stays >= mid, where mid is between the two child means. +fn child_bounds( + config: &TreeConfig, feat: usize, y: &[f64], w: &[f64], + left_idx: &[usize], right_idx: &[usize], lower: f64, upper: f64, +) -> (f64, f64, f64, f64) { + let c = constraint_of(config, feat); + if c == 0 { + return (lower, upper, lower, upper); + } + let lv = weighted_mean(y, w, left_idx); + let rv = weighted_mean(y, w, right_idx); + let mut mid = 0.5 * (lv + rv); + if mid < lower { mid = lower; } + if mid > upper { mid = upper; } + if c > 0 { + (lower, mid, mid, upper) // increasing: left <= mid <= right + } else { + (mid, upper, lower, mid) // decreasing: left >= mid >= right + } +} + +#[allow(clippy::too_many_arguments)] pub fn build_node( x: &ArrayView2, y: &[f64], @@ -184,19 +230,21 @@ pub fn build_node( config: &TreeConfig, rng: &mut Pcg64, hist: &HistogramData, + lower: f64, + upper: f64, ) -> TreeNode { let n = indices.len(); if n < config.min_samples_split || config.max_depth.is_some_and(|md| depth >= md) { - return create_leaf(y, w, indices, config); + return create_leaf(y, w, indices, config, lower, upper); } if config.is_classification && is_pure(y, indices) { - return create_leaf(y, w, indices, config); + return create_leaf(y, w, indices, config, lower, upper); } let (best_feat, best_thresh, best_gain) = find_best_split_hist(x, y, w, indices, config, rng, hist); if best_gain <= 0.0 { - return create_leaf(y, w, indices, config); + return create_leaf(y, w, indices, config, lower, upper); } let (mut left_idx, mut right_idx) = (Vec::new(), Vec::new()); @@ -209,20 +257,23 @@ pub fn build_node( } if left_idx.len() < config.min_samples_leaf || right_idx.len() < config.min_samples_leaf { - return create_leaf(y, w, indices, config); + return create_leaf(y, w, indices, config, lower, upper); } + let (ll, lu, rl, ru) = child_bounds(config, best_feat, y, w, &left_idx, &right_idx, lower, upper); + TreeNode { feature: best_feat, threshold: best_thresh, - left: Some(Box::new(build_node(x, y, w, &left_idx, depth + 1, config, rng, hist))), - right: Some(Box::new(build_node(x, y, w, &right_idx, depth + 1, config, rng, hist))), + left: Some(Box::new(build_node(x, y, w, &left_idx, depth + 1, config, rng, hist, ll, lu))), + right: Some(Box::new(build_node(x, y, w, &right_idx, depth + 1, config, rng, hist, rl, ru))), value: 0.0, samples: n, class_counts: None, } } +#[allow(clippy::too_many_arguments)] pub fn build_node_exact( x: &ArrayView2, y: &[f64], @@ -231,19 +282,21 @@ pub fn build_node_exact( depth: usize, config: &TreeConfig, rng: &mut Pcg64, + lower: f64, + upper: f64, ) -> TreeNode { let n = indices.len(); if n < config.min_samples_split || config.max_depth.is_some_and(|md| depth >= md) { - return create_leaf(y, w, indices, config); + return create_leaf(y, w, indices, config, lower, upper); } if config.is_classification && is_pure(y, indices) { - return create_leaf(y, w, indices, config); + return create_leaf(y, w, indices, config, lower, upper); } let (best_feat, best_thresh, best_gain) = find_best_split_exact(x, y, w, indices, config, rng); if best_gain <= 0.0 { - return create_leaf(y, w, indices, config); + return create_leaf(y, w, indices, config, lower, upper); } let (mut left_idx, mut right_idx) = (Vec::new(), Vec::new()); @@ -256,14 +309,16 @@ pub fn build_node_exact( } if left_idx.len() < config.min_samples_leaf || right_idx.len() < config.min_samples_leaf { - return create_leaf(y, w, indices, config); + return create_leaf(y, w, indices, config, lower, upper); } + let (ll, lu, rl, ru) = child_bounds(config, best_feat, y, w, &left_idx, &right_idx, lower, upper); + TreeNode { feature: best_feat, threshold: best_thresh, - left: Some(Box::new(build_node_exact(x, y, w, &left_idx, depth + 1, config, rng))), - right: Some(Box::new(build_node_exact(x, y, w, &right_idx, depth + 1, config, rng))), + left: Some(Box::new(build_node_exact(x, y, w, &left_idx, depth + 1, config, rng, ll, lu))), + right: Some(Box::new(build_node_exact(x, y, w, &right_idx, depth + 1, config, rng, rl, ru))), value: 0.0, samples: n, class_counts: None, @@ -278,7 +333,7 @@ fn is_pure(y: &[f64], indices: &[usize]) -> bool { indices.iter().all(|&i| (y[i] as usize) == first) } -fn create_leaf(y: &[f64], w: &[f64], indices: &[usize], config: &TreeConfig) -> TreeNode { +fn create_leaf(y: &[f64], w: &[f64], indices: &[usize], config: &TreeConfig, lower: f64, upper: f64) -> TreeNode { let n = indices.len(); if config.is_classification { let mut counts: HashMap = HashMap::new(); @@ -300,19 +355,19 @@ fn create_leaf(y: &[f64], w: &[f64], indices: &[usize], config: &TreeConfig) -> class_counts: Some(counts), } } else { - let mut sw = 0.0_f64; - let mut swy = 0.0_f64; - for &i in indices { - sw += w[i]; - swy += w[i] * y[i]; - } - let mean = if sw > 0.0 { swy / sw } else { 0.0 }; + // Weighted mean of the leaf (shares the zero-weight fallback with + // weighted_mean used during split/bound computation). + let mut value = weighted_mean(y, w, indices); + // Monotone value bounds: clamp into the interval propagated from + // constrained ancestor splits (lower=-inf/upper=+inf when unconstrained). + if value < lower { value = lower; } + if value > upper { value = upper; } TreeNode { feature: 0, threshold: 0.0, left: None, right: None, - value: mean, + value, samples: n, class_counts: None, } @@ -453,6 +508,19 @@ fn find_best_split_hist( let right_wy = total_wy - left_wy; let right_wyy = total_wyy - left_wyy; + + // Monotone constraint: reject candidate splits whose child means + // violate the required direction, so the chosen structure is + // consistent with the value-bound propagation in build_node. + let c = constraint_of(config, feat); + if c != 0 { + let lmean = left_wy / left_w; + let rmean = right_wy / right_w; + if (c > 0 && lmean > rmean) || (c < 0 && lmean < rmean) { + continue; + } + } + let left_var = left_wyy / left_w - (left_wy / left_w).powi(2); let right_var = right_wyy / right_w - (right_wy / right_w).powi(2); let weighted = (left_w * left_var + right_w * right_var) / total_w; @@ -585,6 +653,19 @@ fn find_best_split_exact( let right_wy = total_wy - left_wy; let right_wyy = total_wyy - left_wyy; + + // Monotone constraint: reject candidate splits whose child means + // violate the required direction, so the chosen structure is + // consistent with the value-bound propagation in build_node. + let c = constraint_of(config, feat); + if c != 0 { + let lmean = left_wy / left_w; + let rmean = right_wy / right_w; + if (c > 0 && lmean > rmean) || (c < 0 && lmean < rmean) { + continue; + } + } + let left_var = left_wyy / left_w - (left_wy / left_w).powi(2); let right_var = right_wyy / right_w - (right_wy / right_w).powi(2); let weighted = (left_w * left_var + right_w * right_var) / total_w; diff --git a/src/unsupervised.rs b/src/unsupervised.rs index 76f8013..5e11167 100644 --- a/src/unsupervised.rs +++ b/src/unsupervised.rs @@ -166,6 +166,7 @@ impl RandomForestUnsupervised { min_samples_leaf: self.min_samples_leaf, is_classification: true, max_features: if max_feat < n_features { Some(max_feat) } else { None }, + monotone_constraints: Vec::new(), }; // Pre-compute bootstrap params and track OOB for original samples diff --git a/tests/test_monotonic.py b/tests/test_monotonic.py new file mode 100644 index 0000000..d56379e --- /dev/null +++ b/tests/test_monotonic.py @@ -0,0 +1,103 @@ +"""Tests for monotonic constraints in RFGBoostClassifier. + +Constraints are enforced via value-bound propagation during tree growth, so +monotonicity is exact: sweeping a constrained feature with all others held +fixed never moves the prediction the wrong way. +""" + +import numpy as np + +from rfgboost import RFGBoostClassifier + +PARAMS = dict( + n_estimators=25, + learning_rate=0.2, + rf_n_estimators=30, + rf_max_depth=5, + random_state=42, +) + + +def _ushaped_data(n=2000, seed=0): + """y depends on feature 0 in a U shape, so an UNCONSTRAINED model is + non-monotone in feature 0 — a +1/-1 constraint must actively reshape it.""" + rng = np.random.default_rng(seed) + X = rng.normal(0, 1, (n, 3)) + z = 1.6 * (X[:, 0] ** 2) - 1.0 * X[:, 1] + y = (rng.random(n) < 1.0 / (1.0 + np.exp(-(z - z.mean())))).astype(float) + return X, y + + +def _sweep(model, base_rows, feat, grid): + """Predicted P(y=1) as `feat` is swept over `grid`, all else fixed.""" + paths = [] + for base in base_rows: + rows = np.tile(base, (len(grid), 1)) + rows[:, feat] = grid + paths.append(np.array(model.predict_proba(rows))[:, 1]) + return paths + + +def test_no_constraints_is_backward_compatible(): + """Passing no constraints reproduces the default model exactly.""" + X, y = _ushaped_data() + p_default = np.array(RFGBoostClassifier(**PARAMS).fit(X, y).predict_proba(X))[:, 1] + p_none = np.array( + RFGBoostClassifier(**PARAMS, monotone_constraints=None).fit(X, y).predict_proba(X) + )[:, 1] + assert np.allclose(p_default, p_none) + + +def test_increasing_constraint_holds(): + """With {0: +1}, prediction is non-decreasing in feature 0, all else fixed.""" + X, y = _ushaped_data() + model = RFGBoostClassifier(**PARAMS, monotone_constraints={0: 1}).fit(X, y) + grid = np.linspace(X[:, 0].min(), X[:, 0].max(), 30) + rng = np.random.default_rng(1) + base_rows = [X[rng.integers(len(X))].copy() for _ in range(20)] + for path in _sweep(model, base_rows, 0, grid): + assert np.all(np.diff(path) >= -1e-9) + + +def test_decreasing_constraint_holds(): + """With {0: -1}, prediction is non-increasing in feature 0, all else fixed.""" + X, y = _ushaped_data() + model = RFGBoostClassifier(**PARAMS, monotone_constraints={0: -1}).fit(X, y) + grid = np.linspace(X[:, 0].min(), X[:, 0].max(), 30) + rng = np.random.default_rng(2) + base_rows = [X[rng.integers(len(X))].copy() for _ in range(20)] + for path in _sweep(model, base_rows, 0, grid): + assert np.all(np.diff(path) <= 1e-9) + + +def test_constraint_value_normalized_by_sign(): + """Only the sign of a direction matters: {0: 5} behaves like {0: 1}.""" + X, y = _ushaped_data() + p1 = np.array( + RFGBoostClassifier(**PARAMS, monotone_constraints={0: 1}).fit(X, y).predict_proba(X) + )[:, 1] + p5 = np.array( + RFGBoostClassifier(**PARAMS, monotone_constraints={0: 5}).fit(X, y).predict_proba(X) + )[:, 1] + assert np.allclose(p1, p5) + + +def test_invalid_constraint_key_raises(): + """A key that is not a valid column index fails fast at fit time.""" + X, y = _ushaped_data() # 3 features, so column 99 is out of range + raised = False + try: + RFGBoostClassifier(**PARAMS, monotone_constraints={99: 1}).fit(X, y) + except ValueError: + raised = True + assert raised + + +def test_unconstrained_model_is_non_monotone(): + """Sanity: without the constraint the U-shaped data is non-monotone in + feature 0, so the constraint tests above are not vacuous.""" + X, y = _ushaped_data() + model = RFGBoostClassifier(**PARAMS).fit(X, y) + grid = np.linspace(X[:, 0].min(), X[:, 0].max(), 30) + path = _sweep(model, [np.zeros(3)], 0, grid)[0] + assert not np.all(np.diff(path) >= -1e-9)