From f32d6c0f19239b2c539b55fd0f35ac492615b9af Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 23 Sep 2025 19:57:03 +0200 Subject: [PATCH 01/23] Compiles. Big chunk of the low-level machinery done; still needs to handle bitsets --- sklearn/tree/_classes.py | 10 +++- sklearn/tree/_partitioner.pxd | 10 ++++ sklearn/tree/_partitioner.pyx | 105 ++++++++++++++++++++++++++++++---- sklearn/tree/_splitter.pxd | 1 + sklearn/tree/_splitter.pyx | 19 ++++-- sklearn/tree/_tree.pyx | 6 +- 6 files changed, 132 insertions(+), 19 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index b6ef721c3e974..616ed725ca4e9 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -472,7 +472,15 @@ def _fit( self.min_impurity_decrease, ) - builder.build(self.tree_, X, y, sample_weight, missing_values_in_feature_mask) + n_categories_in_feature = np.zeros(self.n_features_in_, dtype=np.intp) + builder.build( + self.tree_, + X, + y, + sample_weight, + missing_values_in_feature_mask, + n_categories_in_feature, + ) if self.n_outputs_ == 1 and is_classifier(self): self.n_classes_ = self.n_classes_[0] diff --git a/sklearn/tree/_partitioner.pxd b/sklearn/tree/_partitioner.pxd index 63e3bc2bacad0..5f8721df90f9e 100644 --- a/sklearn/tree/_partitioner.pxd +++ b/sklearn/tree/_partitioner.pxd @@ -68,14 +68,23 @@ cdef class DensePartitioner: Note that this partitioner is agnostic to the splitting strategy (best vs. random). """ cdef const float32_t[:, :] X + cdef const float64_t[:, :] y + cdef const float64_t[::1] sample_weight cdef intp_t[::1] samples cdef float32_t[::1] feature_values cdef intp_t start cdef intp_t end cdef intp_t n_missing cdef const uint8_t[::1] missing_values_in_feature_mask + cdef const intp_t[::1] n_categories_in_feature cdef bint missing_on_the_left + cdef intp_t[::1] counts + cdef float64_t[::1] weighted_counts + cdef float64_t[::1] means + cdef intp_t[::1] sorted_cat + cdef intp_t[::1] offsets + cdef void sort_samples_and_feature_values( self, intp_t current_feature ) noexcept nogil @@ -108,6 +117,7 @@ cdef class DensePartitioner: intp_t best_feature, bint best_missing_go_to_left, ) noexcept nogil + cdef void _breiman_sort_categories(self, intp_t nc) noexcept nogil cdef class SparsePartitioner: diff --git a/sklearn/tree/_partitioner.pyx b/sklearn/tree/_partitioner.pyx index a3a2c4940d1ca..5fb286587cd11 100644 --- a/sklearn/tree/_partitioner.pyx +++ b/sklearn/tree/_partitioner.pyx @@ -13,7 +13,7 @@ and sparse data stored in a Compressed Sparse Column (CSC) format. 
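For orientation, the machinery this patch starts building implements the classic Breiman shortcut: for binary classification and squared-error regression, the optimal binary partition of a categorical feature is found by sorting categories by the mean of y and scanning only the nc - 1 prefix cuts of that ordering, instead of all 2^(nc-1) - 1 subsets. A minimal NumPy sketch of the idea (illustrative only; these names are not part of the patch):

    import numpy as np

    def breiman_candidate_splits(x_cat, y, n_categories):
        """Yield the left-child category sets worth evaluating.

        x_cat: integer-coded categories, shape (n_samples,).
        """
        counts = np.bincount(x_cat, minlength=n_categories)
        sums = np.bincount(x_cat, weights=y, minlength=n_categories)
        means = np.divide(sums, np.maximum(counts, 1))  # mean of y per category
        order = np.argsort(means)                       # categories by ascending mean
        for cut in range(1, n_categories):
            yield set(order[:cut].tolist())

The Cython code below does the same thing in-place: it counts and averages per category, sorts category ids by mean, then reorders the samples so a categorical feature can be scanned exactly like a numerical one.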
from cython cimport final
 from libc.math cimport isnan, log2
 from libc.stdlib cimport qsort
-from libc.string cimport memcpy
+from libc.string cimport memcpy, memset
 
 from ._utils cimport swap_array_slices
 
@@ -28,6 +28,8 @@ cdef float32_t EXTRACT_NNZ_SWITCH = 0.1
 # Allow for 32 bit float comparisons
 cdef float32_t INFINITY_32t = np.inf
 
+cdef intp_t MAX_N_CAT = 64
+
 
 @final
 cdef class DensePartitioner:
@@ -38,16 +40,29 @@ cdef class DensePartitioner:
     def __init__(
         self,
         const float32_t[:, :] X,
+        const float64_t[:, :] y,
+        const float64_t[::1] sample_weight,
         intp_t[::1] samples,
         float32_t[::1] feature_values,
         const uint8_t[::1] missing_values_in_feature_mask,
+        const intp_t[::1] n_categories_in_feature,
     ):
         self.X = X
+        self.y = y
+        self.sample_weight = sample_weight
         self.samples = samples
         self.feature_values = feature_values
         self.missing_values_in_feature_mask = missing_values_in_feature_mask
+        self.n_categories_in_feature = n_categories_in_feature
         self.missing_on_the_left = False
 
+        # for the Breiman shortcut:
+        self.counts = np.empty(MAX_N_CAT, dtype=np.intp)
+        self.weighted_counts = np.empty(MAX_N_CAT, dtype=np.float64)
+        self.means = np.empty(MAX_N_CAT, dtype=np.float64)
+        self.sorted_cat = np.empty(MAX_N_CAT, dtype=np.intp)
+        self.offsets = np.empty(MAX_N_CAT, dtype=np.intp)
+
     cdef inline void init_node_split(self, intp_t start, intp_t end) noexcept nogil:
         """Initialize splitter at the beginning of node_split."""
         self.start = start
@@ -98,10 +113,76 @@ cdef class DensePartitioner:
         for i in range(self.start, self.end):
             feature_values[i] = X[samples[i], current_feature]
 
-        sort(&feature_values[self.start], &samples[self.start], self.end - self.start - n_missing)
+        if self.n_categories_in_feature is None or self.n_categories_in_feature[current_feature] <= 0:
+            # not a categorical feature
+            sort(&feature_values[self.start], &samples[self.start], self.end - self.start - n_missing)
+        else:
+            self._breiman_sort_categories(self.n_categories_in_feature[current_feature])
         self.missing_on_the_left = False
         self.n_missing = n_missing
 
+    cdef void _breiman_sort_categories(self, intp_t nc) noexcept nogil:
+        """O(n + nc log nc)"""
+        cdef:
+            intp_t* counts = &self.counts[0]
+            float64_t* weighted_counts = &self.weighted_counts[0]
+            float64_t* means = &self.means[0]
+            intp_t* sorted_cat = &self.sorted_cat[0]
+            intp_t* offsets = &self.offsets[0]
+            float32_t* feature_values = &self.feature_values[0]
+            intp_t* samples = &self.samples[0]
+            intp_t c, r, p, new_p
+            float64_t w = 1.
+
+        memset(means, 0, nc * sizeof(float64_t))
+        memset(counts, 0, nc * sizeof(intp_t))
+        memset(weighted_counts, 0, nc * sizeof(float64_t))
+
+        # compute counts, weighted_counts and means
+        for p in range(self.start, self.end):
+            c = <intp_t> feature_values[p]
+            counts[c] += 1
+            # FIXME: overflow risk when accumulating the y sums?
+            # `sort` now handles float64 keys through the floating_t fused
+            # type below, so the means can stay in float64.
+            means[c] += self.y[samples[p], 0]
+            if self.sample_weight is not None:
+                w = self.sample_weight[samples[p]]
+            self.weighted_counts[c] += w
+
+        for c in range(nc):
+            # TODO: weighted counts
+            if weighted_counts[c] > 0:
+                means[c] /= weighted_counts[c]
+
+        # sorted_cat[r] = the category with the r-th smallest mean (ascending)
+        for c in range(nc):
+            sorted_cat[c] = c
+        sort(means, sorted_cat, nc)
+
+        # build offsets such that:
+        # offsets[c] = sum( counts[x] for all x s.t. 
rank(x) <= rank(c) ) - 1 + cdef intp_t offset = 0 + for r in range(nc): + c = sorted_cat[r] + offset += counts[c] + offsets[c] = offset - 1 + + # sort feature_values & samples such that they are ordered by + # the mean of the category + # while ensuring samples of the same categories are contiguous + p = self.start + while p < self.end: + # XXX: not sure this works + c = feature_values[p] + new_p = offsets[c] + if new_p > p: + swap(feature_values, samples, p, new_p) + # ^ preserves invariant: feature[p] = X[samples[p], f] + offsets[c] -= 1 + else: + p += 1 + cdef void shift_missing_to_the_left(self) noexcept nogil: """ Moves missing values from the right to the left @@ -665,26 +746,30 @@ def _py_sort(float32_t[::1] feature_values, intp_t[::1] samples, intp_t n): sort(&feature_values[0], &samples[0], n) +ctypedef fused floating_t: + float32_t + float64_t + # Sort n-element arrays pointed to by feature_values and samples, simultaneously, # by the values in feature_values. Algorithm: Introsort (Musser, SP&E, 1997). -cdef inline void sort(float32_t* feature_values, intp_t* samples, intp_t n) noexcept nogil: +cdef inline void sort(floating_t* feature_values, intp_t* samples, intp_t n) noexcept nogil: if n == 0: return cdef intp_t maxd = 2 * log2(n) introsort(feature_values, samples, n, maxd) -cdef inline void swap(float32_t* feature_values, intp_t* samples, +cdef inline void swap(floating_t* feature_values, intp_t* samples, intp_t i, intp_t j) noexcept nogil: # Helper for sort feature_values[i], feature_values[j] = feature_values[j], feature_values[i] samples[i], samples[j] = samples[j], samples[i] -cdef inline float32_t median3(float32_t* feature_values, intp_t n) noexcept nogil: +cdef inline floating_t median3(floating_t* feature_values, intp_t n) noexcept nogil: # Median of three pivot selection, after Bentley and McIlroy (1993). # Engineering a sort function. SP&E. Requires 8/3 comparisons on average. - cdef float32_t a = feature_values[0], b = feature_values[n / 2], c = feature_values[n - 1] + cdef floating_t a = feature_values[0], b = feature_values[n / 2], c = feature_values[n - 1] if a < b: if b < c: return b @@ -703,9 +788,9 @@ cdef inline float32_t median3(float32_t* feature_values, intp_t n) noexcept nogi # Introsort with median of 3 pivot selection and 3-way partition function # (robust to repeated elements, e.g. lots of zero features). -cdef void introsort(float32_t* feature_values, intp_t *samples, +cdef void introsort(floating_t* feature_values, intp_t *samples, intp_t n, intp_t maxd) noexcept nogil: - cdef float32_t pivot + cdef floating_t pivot cdef intp_t i, l, r while n > 1: @@ -736,7 +821,7 @@ cdef void introsort(float32_t* feature_values, intp_t *samples, n -= r -cdef inline void sift_down(float32_t* feature_values, intp_t* samples, +cdef inline void sift_down(floating_t* feature_values, intp_t* samples, intp_t start, intp_t end) noexcept nogil: # Restore heap order in feature_values[start:end] by moving the max element to start. 
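    # Usage note on the floating_t fusion above (an illustrative aside, not
    # part of the original patch): the same introsort/heapsort stack now
    # instantiates for float32 keys (raw feature values) and float64 keys
    # (per-category means), e.g.
    #
    #     sort(&feature_values[self.start], &samples[self.start], n)  # float32
    #     sort(means, sorted_cat, nc)                                 # float64
    #
    # which is what lets _breiman_sort_categories order category ids by their
    # float64 means without downcasting.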
cdef intp_t child, maxind, root @@ -759,7 +844,7 @@ cdef inline void sift_down(float32_t* feature_values, intp_t* samples, root = maxind -cdef void heapsort(float32_t* feature_values, intp_t* samples, intp_t n) noexcept nogil: +cdef void heapsort(floating_t* feature_values, intp_t* samples, intp_t n) noexcept nogil: cdef intp_t start, end # heapify diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 899f130b9c291..589760efcf77d 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -84,6 +84,7 @@ cdef class Splitter: const float64_t[:, ::1] y, const float64_t[:] sample_weight, const uint8_t[::1] missing_values_in_feature_mask, + const intp_t[::1] n_categories_in_feature, ) except -1 cdef int node_reset( diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 171bf43bb4ce2..c9206aa0dd5ec 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -129,6 +129,7 @@ cdef class Splitter: const float64_t[:, ::1] y, const float64_t[:] sample_weight, const uint8_t[::1] missing_values_in_feature_mask, + const intp_t[::1] n_categories_in_feature, ) except -1: """Initialize the splitter. @@ -755,10 +756,12 @@ cdef class BestSplitter(Splitter): const float64_t[:, ::1] y, const float64_t[:] sample_weight, const uint8_t[::1] missing_values_in_feature_mask, + const intp_t[::1] n_categories_in_feature, ) except -1: - Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) + Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask, n_categories_in_feature) self.partitioner = DensePartitioner( - X, self.samples, self.feature_values, missing_values_in_feature_mask + X, y, sample_weight, self.samples, self.feature_values, + missing_values_in_feature_mask, n_categories_in_feature ) cdef int node_split( @@ -783,8 +786,9 @@ cdef class BestSparseSplitter(Splitter): const float64_t[:, ::1] y, const float64_t[:] sample_weight, const uint8_t[::1] missing_values_in_feature_mask, + const intp_t[::1] n_categories_in_feature, ) except -1: - Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) + Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask, n_categories_in_feature) self.partitioner = SparsePartitioner( X, self.samples, self.n_samples, self.feature_values, missing_values_in_feature_mask ) @@ -811,10 +815,12 @@ cdef class RandomSplitter(Splitter): const float64_t[:, ::1] y, const float64_t[:] sample_weight, const uint8_t[::1] missing_values_in_feature_mask, + const intp_t[::1] n_categories_in_feature, ) except -1: - Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) + Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask, n_categories_in_feature) self.partitioner = DensePartitioner( - X, self.samples, self.feature_values, missing_values_in_feature_mask + X, y, sample_weight, self.samples, self.feature_values, + missing_values_in_feature_mask, n_categories_in_feature ) cdef int node_split( @@ -839,8 +845,9 @@ cdef class RandomSparseSplitter(Splitter): const float64_t[:, ::1] y, const float64_t[:] sample_weight, const uint8_t[::1] missing_values_in_feature_mask, + const intp_t[::1] n_categories_in_feature, ) except -1: - Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) + Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask, n_categories_in_feature) self.partitioner = SparsePartitioner( X, self.samples, self.n_samples, self.feature_values, missing_values_in_feature_mask ) diff --git 
a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 9d0b2854c3ba0..83203ea142594 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -145,6 +145,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): const float64_t[:, ::1] y, const float64_t[:] sample_weight=None, const uint8_t[::1] missing_values_in_feature_mask=None, + const intp_t [::1] n_categories_in_feature=None, ): """Build a decision tree from the training set (X, y).""" @@ -170,7 +171,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef float64_t min_impurity_decrease = self.min_impurity_decrease # Recursive partition (without actual recursion) - splitter.init(X, y, sample_weight, missing_values_in_feature_mask) + splitter.init(X, y, sample_weight, missing_values_in_feature_mask, n_categories_in_feature) cdef intp_t start cdef intp_t end @@ -400,6 +401,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): const float64_t[:, ::1] y, const float64_t[:] sample_weight=None, const uint8_t[::1] missing_values_in_feature_mask=None, + const intp_t[::1] n_categories_in_feature=None, ): """Build a decision tree from the training set (X, y).""" @@ -411,7 +413,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef intp_t max_leaf_nodes = self.max_leaf_nodes # Recursive partition (without actual recursion) - splitter.init(X, y, sample_weight, missing_values_in_feature_mask) + splitter.init(X, y, sample_weight, missing_values_in_feature_mask, n_categories_in_feature) cdef vector[FrontierRecord] frontier cdef FrontierRecord record From 35c10acab283b50bc54b10f5ac23eb078ea81710 Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 23 Sep 2025 20:34:56 +0200 Subject: [PATCH 02/23] Added bitset creation logic; TODO: using it. --- sklearn/tree/_partitioner.pyx | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/sklearn/tree/_partitioner.pyx b/sklearn/tree/_partitioner.pyx index 5fb286587cd11..c7ec18f0c2d9d 100644 --- a/sklearn/tree/_partitioner.pyx +++ b/sklearn/tree/_partitioner.pyx @@ -28,7 +28,7 @@ cdef float32_t EXTRACT_NNZ_SWITCH = 0.1 # Allow for 32 bit float comparisons cdef float32_t INFINITY_32t = np.inf -cdef intp_t MAX_N_CAT = 64 +cdef intp_t MAX_N_CAT = 32 @final @@ -183,6 +183,18 @@ cdef class DensePartitioner: else: p += 1 + cdef uint32_t _split_pos_to_bitset(self, intp_t p, intp_t nc): + cdef uint32_t bitset = 0 + cdef intp_t r, c + cdef intp_t offset = 0 + for r in range(nc): + c = self.sorted_cat[r] + bitset |= 1 << c + offset += self.counts[c] + if offset >= p: + break + return bitset + cdef void shift_missing_to_the_left(self) noexcept nogil: """ Moves missing values from the right to the left From 61cdba743f45d8b7f0eb4a17fffe051e6ad74af5 Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 23 Sep 2025 22:25:43 +0200 Subject: [PATCH 03/23] added goes_left function --- sklearn/tree/_partitioner.pxd | 3 ++- sklearn/tree/_partitioner.pyx | 19 +++++++++++++++---- sklearn/tree/_utils.pxd | 15 ++++++++++++++- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/sklearn/tree/_partitioner.pxd b/sklearn/tree/_partitioner.pxd index 5f8721df90f9e..d98f0077c501a 100644 --- a/sklearn/tree/_partitioner.pxd +++ b/sklearn/tree/_partitioner.pxd @@ -4,7 +4,7 @@ # See _partitioner.pyx for details. 
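The 64-bit bitset encoding built by _split_pos_to_bitset in the previous patch, and consumed by the goes_left helper added in this one, is the core trick: bit c of the mask set means "category c goes to the left child" (hence the MAX_N_CAT cap). A small pure-Python sketch of the encoding and membership test, with hypothetical helper names:

    def make_cat_split(left_categories):
        # encode "these categories go left" as a 64-bit mask
        bitset = 0
        for c in left_categories:
            assert 0 <= c < 64
            bitset |= 1 << c
        return bitset

    def category_goes_left(bitset, category):
        return bool(bitset & (1 << category))

    split = make_cat_split({0, 3})         # == 0b1001
    assert category_goes_left(split, 3)
    assert not category_goes_left(split, 2)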
from ..utils._typedefs cimport (
-    float32_t, float64_t, int8_t, int32_t, intp_t, uint8_t, uint32_t
+    float32_t, float64_t, int8_t, int32_t, intp_t, uint8_t, uint32_t, uint64_t
 )
 from ._splitter cimport SplitRecord
 
@@ -118,6 +118,7 @@ cdef class DensePartitioner:
         bint best_missing_go_to_left,
     ) noexcept nogil
     cdef void _breiman_sort_categories(self, intp_t nc) noexcept nogil
+    cdef inline uint64_t _split_pos_to_bitset(self, intp_t p, intp_t nc) noexcept nogil
 
 
 cdef class SparsePartitioner:
diff --git a/sklearn/tree/_partitioner.pyx b/sklearn/tree/_partitioner.pyx
index c7ec18f0c2d9d..0a093e1dab085 100644
--- a/sklearn/tree/_partitioner.pyx
+++ b/sklearn/tree/_partitioner.pyx
@@ -15,7 +15,7 @@ from cython cimport final
 from libc.math cimport isnan, log2
 from libc.stdlib cimport qsort
 from libc.string cimport memcpy, memset
 
-from ._utils cimport swap_array_slices
+from ._utils cimport SplitValue, swap_array_slices
 
 import numpy as np
 from scipy.sparse import issparse
@@ -28,7 +28,7 @@ cdef float32_t EXTRACT_NNZ_SWITCH = 0.1
 # Allow for 32 bit float comparisons
 cdef float32_t INFINITY_32t = np.inf
 
-cdef intp_t MAX_N_CAT = 32
+cdef intp_t MAX_N_CAT = 64
 
 
 @final
@@ -183,8 +183,8 @@ cdef class DensePartitioner:
             else:
                 p += 1
 
-    cdef uint32_t _split_pos_to_bitset(self, intp_t p, intp_t nc):
-        cdef uint32_t bitset = 0
+    cdef inline uint64_t _split_pos_to_bitset(self, intp_t p, intp_t nc) noexcept nogil:
+        cdef uint64_t bitset = 0
         cdef intp_t r, c
         cdef intp_t offset = 0
         for r in range(nc):
@@ -601,6 +601,17 @@ cdef class SparsePartitioner:
                              &self.end_negative, &self.start_positive)
 
 
+cdef inline bint goes_left(
+    SplitValue split_value, bint missing_go_to_left, bint is_categorical, float32_t value
+) noexcept nogil:
+    if isnan(value):
+        return missing_go_to_left
+    elif is_categorical:
+        return split_value.cat_split & (1 << (<intp_t> value))
+    else:
+        return value <= split_value.threshold
+
+
 cdef int compare_SIZE_t(const void* a, const void* b) noexcept nogil:
     """Comparison function for sort.
 
diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd
index 67b438d42fdbc..1a588388c773a 100644
--- a/sklearn/tree/_utils.pxd
+++ b/sklearn/tree/_utils.pxd
@@ -6,7 +6,9 @@ cimport numpy as cnp
 from ._tree cimport Node
 from ..neighbors._quad_tree cimport Cell
-from ..utils._typedefs cimport float32_t, float64_t, intp_t, uint8_t, int32_t, uint32_t
+from ..utils._typedefs cimport (
+    float32_t, float64_t, intp_t, uint8_t, int32_t, uint32_t, uint64_t
+)
 
 
 cdef enum:
@@ -56,6 +58,17 @@ cdef int swap_array_slices(
     void* array, intp_t start, intp_t end, intp_t n, size_t itemsize
 ) except -1 nogil
 
+
+ctypedef union SplitValue:
+    # Union type to generalize the concept of a threshold to categorical
+    # features. The floating point view, i.e. ``split_value.threshold`` is used
+    # for numerical features, where feature values less than or equal to the
+    # threshold go left, and values greater than the threshold go right.
+ # + # For categorical features, TODO + float64_t threshold + uint64_t cat_split # bitset + # ============================================================================= # WeightedPQueue data structure # ============================================================================= From e8cbf55677fb228e469197e6f5799c783c170594 Mon Sep 17 00:00:00 2001 From: Arthur Date: Wed, 24 Sep 2025 12:01:37 +0200 Subject: [PATCH 04/23] Most changes down; all tests (non categorical) pass; still remains to trigger the categorical split path and test it --- sklearn/ensemble/_gradient_boosting.pyx | 3 +- sklearn/tree/_partitioner.pxd | 13 +++- sklearn/tree/_partitioner.pyx | 90 ++++++++++++++++++------- sklearn/tree/_splitter.pxd | 3 +- sklearn/tree/_splitter.pyx | 29 ++------ sklearn/tree/_tree.pxd | 24 +++---- sklearn/tree/_tree.pyx | 33 +++++++-- sklearn/tree/_utils.pxd | 37 ++++++---- 8 files changed, 147 insertions(+), 85 deletions(-) diff --git a/sklearn/ensemble/_gradient_boosting.pyx b/sklearn/ensemble/_gradient_boosting.pyx index cd9845a217c7d..86b65e1f36c8a 100644 --- a/sklearn/ensemble/_gradient_boosting.pyx +++ b/sklearn/ensemble/_gradient_boosting.pyx @@ -10,9 +10,8 @@ from scipy.sparse import issparse from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint8_t # Note: _tree uses cimport numpy, cnp.import_array, so we need to include # numpy headers in the build configuration of this extension -from ..tree._tree cimport Node from ..tree._tree cimport Tree -from ..tree._utils cimport safe_realloc +from ..tree._utils cimport Node, safe_realloc # no namespace lookup for numpy dtype and array creation diff --git a/sklearn/tree/_partitioner.pxd b/sklearn/tree/_partitioner.pxd index d98f0077c501a..12fd019f1b8aa 100644 --- a/sklearn/tree/_partitioner.pxd +++ b/sklearn/tree/_partitioner.pxd @@ -7,6 +7,7 @@ from ..utils._typedefs cimport ( float32_t, float64_t, int8_t, int32_t, intp_t, uint8_t, uint32_t, uint64_t ) from ._splitter cimport SplitRecord +from ._utils cimport SplitValue # Mitigate precision differences between 32 bit and 64 bit @@ -78,6 +79,7 @@ cdef class DensePartitioner: cdef const uint8_t[::1] missing_values_in_feature_mask cdef const intp_t[::1] n_categories_in_feature cdef bint missing_on_the_left + cdef intp_t n_categories cdef intp_t[::1] counts cdef float64_t[::1] weighted_counts @@ -105,6 +107,9 @@ cdef class DensePartitioner: intp_t* p_prev, intp_t* p ) noexcept nogil + cdef inline SplitValue pos_to_threshold( + self, intp_t p_prev, intp_t p + ) noexcept nogil cdef intp_t partition_samples( self, float64_t current_threshold, @@ -113,7 +118,7 @@ cdef class DensePartitioner: cdef void partition_samples_final( self, intp_t best_pos, - float64_t best_threshold, + SplitValue split_value, intp_t best_feature, bint best_missing_go_to_left, ) noexcept nogil @@ -143,6 +148,7 @@ cdef class SparsePartitioner: cdef intp_t end cdef intp_t n_missing cdef const uint8_t[::1] missing_values_in_feature_mask + cdef intp_t n_categories cdef void sort_samples_and_feature_values( self, intp_t current_feature @@ -164,6 +170,9 @@ cdef class SparsePartitioner: intp_t* p_prev, intp_t* p ) noexcept nogil + cdef inline SplitValue pos_to_threshold( + self, intp_t p_prev, intp_t p + ) noexcept nogil cdef intp_t partition_samples( self, float64_t current_threshold, @@ -172,7 +181,7 @@ cdef class SparsePartitioner: cdef void partition_samples_final( self, intp_t best_pos, - float64_t best_threshold, + SplitValue split_value, intp_t best_feature, bint best_missing_go_to_left, ) noexcept 
nogil diff --git a/sklearn/tree/_partitioner.pyx b/sklearn/tree/_partitioner.pyx index 20b851fc0844b..bf6924953d12e 100644 --- a/sklearn/tree/_partitioner.pyx +++ b/sklearn/tree/_partitioner.pyx @@ -15,7 +15,7 @@ from libc.math cimport isnan, log2 from libc.stdlib cimport qsort from libc.string cimport memcpy, memset -from ._utils cimport SplitValue, swap_array_slices +from ._utils cimport swap_array_slices import numpy as np from scipy.sparse import issparse @@ -25,6 +25,7 @@ from scipy.sparse import issparse # in SparsePartitioner cdef float32_t EXTRACT_NNZ_SWITCH = 0.1 +cdef float64_t INFINITY = np.inf # Allow for 32 bit float comparisons cdef float32_t INFINITY_32t = np.inf @@ -55,6 +56,7 @@ cdef class DensePartitioner: self.missing_values_in_feature_mask = missing_values_in_feature_mask self.n_categories_in_feature = n_categories_in_feature self.missing_on_the_left = False + self.n_categories = 0 # for breiman shortcut: self.counts = np.empty(MAX_N_CAT, dtype=np.intp) @@ -113,11 +115,12 @@ cdef class DensePartitioner: for i in range(self.start, self.end): feature_values[i] = X[samples[i], current_feature] - if self.n_categories_in_feature is None or self.n_categories_in_feature[current_feature] <= 0: + self.n_categories = self.n_categories_in_feature[current_feature] + if self.n_categories <= 0: # no a categorical feature sort(&feature_values[self.start], &samples[self.start], self.end - self.start - n_missing) else: - self._breiman_sort_categories(self.n_categories_in_feature[current_feature]) + self._breiman_sort_categories(self.n_categories) self.missing_on_the_left = False self.n_missing = n_missing @@ -183,18 +186,6 @@ cdef class DensePartitioner: else: p += 1 - cdef inline uint64_t _split_pos_to_bitset(self, intp_t p, intp_t nc) noexcept nogil: - cdef uint64_t bitset = 0 - cdef intp_t r, c - cdef intp_t offset = 0 - for r in range(nc): - c = self.sorted_cat[r] - bitset |= 1 << c - offset += self.counts[c] - if offset >= p: - break - return bitset - cdef void shift_missing_to_the_left(self) noexcept nogil: """ Moves missing values from the right to the left @@ -276,6 +267,33 @@ cdef class DensePartitioner: p[0] += 1 p_prev[0] = p[0] - 1 + cdef inline SplitValue pos_to_threshold( + self, intp_t p_prev, intp_t p + ) noexcept nogil: + cdef intp_t end_non_missing = ( + self.end if self.missing_on_the_left + else self.end - self.n_missing) + + cdef SplitValue split + if self.n_categories > 0: + split.cat_split = self._split_pos_to_bitset(p, self.n_categories) + return split + + if p == end_non_missing and not self.missing_on_the_left: + # split with the right node being only the missing values + split.threshold = INFINITY + return split + + # split between two non-missing values + # sum of halves is used to avoid infinite value + split.threshold = ( + self.feature_values[p_prev] / 2.0 + self.feature_values[p] / 2.0 + ) + if split.threshold == INFINITY or split.threshold == -INFINITY: + split.threshold = self.feature_values[p_prev] + + return split + cdef inline intp_t partition_samples( self, float64_t current_threshold, @@ -305,7 +323,7 @@ cdef class DensePartitioner: cdef inline void partition_samples_final( self, intp_t best_pos, - float64_t best_threshold, + SplitValue split_value, intp_t best_feature, bint best_missing_go_to_left ) noexcept nogil: @@ -320,20 +338,28 @@ cdef class DensePartitioner: intp_t partition_end = self.end intp_t* samples = &self.samples[0] float32_t current_value - bint go_to_left + bint is_cat = self.n_categories_in_feature[best_feature] > 0 while p < 
partition_end: current_value = self.X[samples[p], best_feature] - go_to_left = ( - best_missing_go_to_left if isnan(current_value) - else current_value <= best_threshold - ) - if go_to_left: + if goes_left(split_value, best_missing_go_to_left, is_cat, current_value): p += 1 else: partition_end -= 1 samples[p], samples[partition_end] = samples[partition_end], samples[p] + cdef inline uint64_t _split_pos_to_bitset(self, intp_t p, intp_t nc) noexcept nogil: + cdef uint64_t bitset = 0 + cdef intp_t r, c + cdef intp_t offset = 0 + for r in range(nc): + c = self.sorted_cat[r] + bitset |= 1 << c + offset += self.counts[c] + if offset >= p: + break + return bitset + @final cdef class SparsePartitioner: @@ -372,6 +398,7 @@ cdef class SparsePartitioner: self.index_to_samples[samples[p]] = p self.missing_values_in_feature_mask = missing_values_in_feature_mask + self.n_categories = 0 cdef inline void init_node_split(self, intp_t start, intp_t end) noexcept nogil: """Initialize splitter at the beginning of node_split.""" @@ -485,6 +512,21 @@ cdef class SparsePartitioner: p_prev[0] = p[0] p[0] = p_next + cdef inline SplitValue pos_to_threshold( + self, intp_t p_prev, intp_t p + ) noexcept nogil: + + cdef SplitValue split + # split between two non-missing values + # sum of halves is used to avoid infinite value + split.threshold = ( + self.feature_values[p_prev] / 2.0 + self.feature_values[p] / 2.0 + ) + if split.threshold == INFINITY or split.threshold == -INFINITY: + split.threshold = self.feature_values[p_prev] + + return split + cdef inline intp_t partition_samples( self, float64_t current_threshold, @@ -496,13 +538,13 @@ cdef class SparsePartitioner: cdef inline void partition_samples_final( self, intp_t best_pos, - float64_t best_threshold, + SplitValue split_value, intp_t best_feature, bint missing_go_to_left ) noexcept nogil: """Partition samples for X at the best_threshold and best_feature.""" self.extract_nnz(best_feature) - self._partition(best_threshold, best_pos) + self._partition(split_value.threshold, best_pos) cdef inline intp_t _partition(self, float64_t threshold, intp_t zero_pos) noexcept nogil: """Partition samples[start:end] based on threshold.""" diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 589760efcf77d..cfd13a3b69600 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -8,6 +8,7 @@ from ..utils._typedefs cimport ( ) from ._criterion cimport Criterion from ._tree cimport ParentInfo +from ._utils cimport SplitValue cdef struct SplitRecord: @@ -16,7 +17,7 @@ cdef struct SplitRecord: intp_t pos # Split samples array at the given position, # # i.e. count of samples below threshold for feature. # # pos is >= end if the node is a leaf. - float64_t threshold # Threshold to split at. + SplitValue value # Threshold/Bitset to split at. float64_t improvement # Impurity improvement given parent node. float64_t impurity_left # Impurity of the left split. float64_t impurity_right # Impurity of the right split. diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index c9206aa0dd5ec..80721d8f7e7c0 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -49,7 +49,6 @@ cdef inline void _init_split(SplitRecord* self, intp_t start_pos) noexcept nogil self.impurity_right = INFINITY self.pos = start_pos self.feature = 0 - self.threshold = 0. 
self.improvement = -INFINITY self.missing_go_to_left = False self.n_missing = 0 @@ -446,21 +445,7 @@ cdef inline int node_split_best( if current_proxy_improvement > best_proxy_improvement: best_proxy_improvement = current_proxy_improvement - if p == end_non_missing and not missing_go_to_left: - # split with the right node being only the missing values - current_split.threshold = INFINITY - else: - # split between two non-missing values - # sum of halves is used to avoid infinite value - current_split.threshold = ( - feature_values[p_prev] / 2.0 + feature_values[p] / 2.0 - ) - if ( - current_split.threshold == INFINITY or - current_split.threshold == -INFINITY - ): - current_split.threshold = feature_values[p_prev] - + current_split.value = partitioner.pos_to_threshold(p_prev, p) current_split.n_missing = n_missing # if there are no missing values in the training data, during @@ -477,7 +462,7 @@ cdef inline int node_split_best( if best_split.pos < end: partitioner.partition_samples_final( best_split.pos, - best_split.threshold, + best_split.value, best_split.feature, best_split.missing_go_to_left ) @@ -636,7 +621,7 @@ cdef inline int node_split_random( has_missing = n_missing != 0 # Draw a random threshold - current_split.threshold = rand_uniform( + current_split.value.threshold = rand_uniform( min_feature_value, max_feature_value, random_state, @@ -656,12 +641,12 @@ cdef inline int node_split_random( else: missing_go_to_left = 0 - if current_split.threshold == max_feature_value: - current_split.threshold = min_feature_value + if current_split.value.threshold == max_feature_value: + current_split.value.threshold = min_feature_value # Partition current_split.pos = partitioner.partition_samples( - current_split.threshold, missing_go_to_left + current_split.value.threshold, missing_go_to_left ) n_left = current_split.pos - start @@ -715,7 +700,7 @@ cdef inline int node_split_random( if current_split.feature != best_split.feature: partitioner.partition_samples_final( best_split.pos, - best_split.threshold, + best_split.value, best_split.feature, best_split.missing_go_to_left ) diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 2cadca4564a87..e37c0493735b7 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -6,22 +6,13 @@ import numpy as np cimport numpy as cnp -from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint8_t, uint32_t +from ..utils._typedefs cimport ( + float32_t, float64_t, intp_t, int32_t, uint8_t, uint32_t, uint64_t +) from ._splitter cimport Splitter from ._splitter cimport SplitRecord - -cdef struct Node: - # Base storage structure for the nodes in a Tree object - - intp_t left_child # id of the left child of the node - intp_t right_child # id of the right child of the node - intp_t feature # Feature used for splitting the node - float64_t threshold # Threshold value at the node - float64_t impurity # Impurity of the node (i.e., the value of the criterion) - intp_t n_node_samples # Number of samples at the node - float64_t weighted_n_node_samples # Weighted number of samples at the node - uint8_t missing_go_to_left # Whether features have missing values +from ._utils cimport Node, SplitValue cdef struct ParentInfo: @@ -44,6 +35,9 @@ cdef class Tree: cdef public intp_t n_outputs # Number of outputs in y cdef public intp_t max_n_classes # max(n_classes) + # FIXME: change to uint8_t: but the error it triggers might be a Cython bug + cdef intp_t* is_categorical # Shape (n_features,) + # Inner structures: values are stored separately 
from node structure, # since size is determined at runtime. cdef public intp_t max_depth # Max depth of the tree @@ -55,8 +49,8 @@ cdef class Tree: # Methods cdef intp_t _add_node(self, intp_t parent, bint is_left, bint is_leaf, - intp_t feature, float64_t threshold, float64_t impurity, - intp_t n_node_samples, + intp_t feature, float64_t threshold, uint64_t cat_split, + float64_t impurity, intp_t n_node_samples, float64_t weighted_n_node_samples, uint8_t missing_go_to_left) except -1 nogil cdef int _resize(self, intp_t capacity) except -1 nogil diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 83203ea142594..73642cd2e639c 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -255,7 +255,8 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): min_impurity_decrease)) node_id = tree._add_node(parent, is_left, is_leaf, split.feature, - split.threshold, parent_record.impurity, + split.value.threshold, split.value.cat_split, + parent_record.impurity, n_node_samples, weighted_n_node_samples, split.missing_go_to_left) @@ -469,6 +470,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): node.right_child = _TREE_LEAF node.feature = _TREE_UNDEFINED node.threshold = _TREE_UNDEFINED + # node.categorical_bitset = _TREE_UNDEFINED else: # Node is expandable @@ -613,8 +615,9 @@ cdef class BestFirstTreeBuilder(TreeBuilder): node_id = tree._add_node(parent - tree.nodes if parent != NULL else _TREE_UNDEFINED, - is_left, is_leaf, - split.feature, split.threshold, parent_record.impurity, + is_left, is_leaf, split.feature, + split.value.threshold, split.value.cat_split, + parent_record.impurity, n_node_samples, weighted_n_node_samples, split.missing_go_to_left) if node_id == INTPTR_MAX: @@ -790,6 +793,11 @@ cdef class Tree: for k in range(n_outputs): self.n_classes[k] = n_classes[k] + self.is_categorical = NULL + safe_realloc(&self.is_categorical, n_features) + for f in range(n_features): + self.is_categorical[f] = False + # Inner structures self.max_depth = 0 self.node_count = 0 @@ -800,6 +808,7 @@ cdef class Tree: def __dealloc__(self): """Destructor.""" # Free all inner structures + free(self.is_categorical) free(self.n_classes) free(self.value) free(self.nodes) @@ -897,8 +906,8 @@ cdef class Tree: return 0 cdef intp_t _add_node(self, intp_t parent, bint is_left, bint is_leaf, - intp_t feature, float64_t threshold, float64_t impurity, - intp_t n_node_samples, + intp_t feature, float64_t threshold, uint64_t cat_split, + float64_t impurity, intp_t n_node_samples, float64_t weighted_n_node_samples, uint8_t missing_go_to_left) except -1 nogil: """Add a node to the tree. 
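With Node now carrying both a float64 threshold and a uint64 categorical_bitset, the traversal rule that later patches wire into apply() can be summarized in schematic Python — a sketch only, assuming the Node field names above:

    import math

    def child_of(node, x_value, feature_is_categorical):
        if math.isnan(x_value):
            return node.left_child if node.missing_go_to_left else node.right_child
        if feature_is_categorical:
            go_left = node.categorical_bitset >> int(x_value) & 1
        else:
            go_left = x_value <= node.threshold
        return node.left_child if go_left else node.right_child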
@@ -929,11 +938,17 @@ cdef class Tree: node.right_child = _TREE_LEAF node.feature = _TREE_UNDEFINED node.threshold = _TREE_UNDEFINED + # node.categorical_bitset = _TREE_UNDEFINED else: # left_child and right_child will be set later node.feature = feature - node.threshold = threshold + if self.is_categorical[feature]: + node.threshold = -INFINITY + node.categorical_bitset = cat_split + else: + node.threshold = threshold + node.categorical_bitset = cat_split node.missing_go_to_left = missing_go_to_left self.node_count += 1 @@ -990,6 +1005,7 @@ cdef class Tree: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] + # TODO: handle categorical case: elif X_i_node_feature <= node.threshold: node = &self.nodes[node.left_child] else: @@ -1111,6 +1127,7 @@ cdef class Tree: indices[indptr[i + 1]] = (node - self.nodes) indptr[i + 1] += 1 + # TODO handle categorical if X_ndarray[i, node.feature] <= node.threshold: node = &self.nodes[node.left_child] else: @@ -1391,6 +1408,7 @@ cdef class Tree: if is_target_feature: # In this case, we push left or right child on stack + # TODO: handle categotical (and missing?) if X[sample_idx, feature_idx] <= current_node.threshold: node_idx_stack[stack_size] = current_node.left_child else: @@ -1931,7 +1949,8 @@ cdef void _build_pruned_tree( break new_node_id = tree._add_node( - parent, is_left, is_leaf, node.feature, node.threshold, + parent, is_left, is_leaf, node.feature, + node.threshold, node.categorical_bitset, node.impurity, node.n_node_samples, node.weighted_n_node_samples, node.missing_go_to_left) diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 1a588388c773a..524a60fed7cac 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -4,13 +4,37 @@ # See _utils.pyx for details. cimport numpy as cnp -from ._tree cimport Node from ..neighbors._quad_tree cimport Cell from ..utils._typedefs cimport ( float32_t, float64_t, intp_t, uint8_t, int32_t, uint32_t, uint64_t ) +ctypedef union SplitValue: + # Union type to generalize the concept of a threshold to categorical + # features. The floating point view, i.e. ``split_value.threshold`` is used + # for numerical features, where feature values less than or equal to the + # threshold go left, and values greater than the threshold go right. + # + # For categorical features, TODO + float64_t threshold + uint64_t cat_split # bitset + + +cdef struct Node: + # Base storage structure for the nodes in a Tree object + + intp_t left_child # id of the left child of the node + intp_t right_child # id of the right child of the node + intp_t feature # Feature used for splitting the node + float64_t threshold # Threshold value at the node, for continuous split (-INF otherwise) + uint64_t categorical_bitset # Bitset for categorical split (0 otherwise) + float64_t impurity # Impurity of the node (i.e., the value of the criterion) + intp_t n_node_samples # Number of samples at the node + float64_t weighted_n_node_samples # Weighted number of samples at the node + uint8_t missing_go_to_left # Whether features have missing values + + cdef enum: # Max value for our rand_r replacement (near the bottom). # We don't use RAND_MAX because it's different across platforms and @@ -58,17 +82,6 @@ cdef int swap_array_slices( void* array, intp_t start, intp_t end, intp_t n, size_t itemsize ) except -1 nogil - -ctypedef union SplitValue: - # Union type to generalize the concept of a threshold to categorical - # features. The floating point view, i.e. 
``split_value.threshold`` is used - # for numerical features, where feature values less than or equal to the - # threshold go left, and values greater than the threshold go right. - # - # For categorical features, TODO - float64_t threshold - uint64_t cat_split # bitset - # ============================================================================= # WeightedPQueue data structure # ============================================================================= From 4c571af93229f23e4ffd0c4675e95e89161598ae Mon Sep 17 00:00:00 2001 From: Arthur Date: Wed, 24 Sep 2025 17:20:47 +0200 Subject: [PATCH 05/23] added categorical_features kwarg to top-level classes; non-categorical tests pass; TODO: test with categories --- sklearn/tree/_classes.py | 159 +++++++++++++++++++++++++++++++- sklearn/tree/_partitioner.pyx | 1 + sklearn/tree/_tree.pyx | 19 ++-- sklearn/tree/tests/test_tree.py | 18 +++- 4 files changed, 182 insertions(+), 15 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 616ed725ca4e9..f3147fef18cc1 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -47,6 +47,7 @@ _assert_all_finite_element_wise, _check_n_features, _check_sample_weight, + _is_pandas_df, assert_all_finite, check_is_fitted, validate_data, @@ -125,6 +126,11 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): "min_impurity_decrease": [Interval(Real, 0.0, None, closed="left")], "ccp_alpha": [Interval(Real, 0.0, None, closed="left")], "monotonic_cst": ["array-like", None], + "categorical_features": [ + "array-like", + StrOptions({"from_dtype"}), + None, + ], } @abstractmethod @@ -144,6 +150,7 @@ def __init__( class_weight=None, ccp_alpha=0.0, monotonic_cst=None, + categorical_features=None, ): self.criterion = criterion self.splitter = splitter @@ -158,6 +165,7 @@ def __init__( self.class_weight = class_weight self.ccp_alpha = ccp_alpha self.monotonic_cst = monotonic_cst + self.categorical_features = categorical_features def get_depth(self): """Return the depth of the decision tree. @@ -241,6 +249,8 @@ def _fit( ): random_state = check_random_state(self.random_state) + self.is_categorical_ = self._check_categorical_features(X) + if check_input: # Need to validate separately here. 
# We can't pass multi_output=True because that would allow y to be @@ -259,13 +269,18 @@ def _fit( missing_values_in_feature_mask = ( self._compute_missing_values_in_feature_mask(X) ) - if issparse(X): + is_sparse_X = issparse(X) + if is_sparse_X: X.sort_indices() if X.indices.dtype != np.intc or X.indptr.dtype != np.intc: raise ValueError( "No support for np.int64 index based sparse matrices" ) + if is_sparse_X and self.categorical_features is not None: + raise NotImplementedError( + "Categorical features not supported with sparse inputs" + ) if self.criterion == "poisson": if np.any(y < 0): @@ -442,13 +457,19 @@ def _fit( ) if is_classifier(self): - self.tree_ = Tree(self.n_features_in_, self.n_classes_, self.n_outputs_) + self.tree_ = Tree( + self.n_features_in_, + self.n_classes_, + self.n_outputs_, + self.is_categorical_, + ) else: self.tree_ = Tree( self.n_features_in_, # TODO: tree shouldn't need this in this case np.array([1] * self.n_outputs_, dtype=np.intp), self.n_outputs_, + self.is_categorical_, ) # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise @@ -472,7 +493,14 @@ def _fit( self.min_impurity_decrease, ) - n_categories_in_feature = np.zeros(self.n_features_in_, dtype=np.intp) + n_categories_in_feature = np.full(self.n_features_in_, -1, dtype=np.intp) + if self.is_categorical_ is not None: + for idx in np.where(self.is_categorical_)[0]: + assert np.allclose(X[:, idx].astype(np.intp), X[:, idx]) + assert X[:, idx].min() >= 0 + n_categories_in_feature[idx] = X[:, idx].max() + 1 + assert (n_categories_in_feature <= 64).all() + builder.build( self.tree_, X, @@ -490,6 +518,126 @@ def _fit( return self + def _check_categorical_features(self, X): + """Copied from gradient_boosting.py on sept. 2025 + Check and validate categorical features in X + + Parameters + ---------- + X : {array-like, pandas DataFrame} of shape (n_samples, n_features) + Input data. + + Return + ------ + is_categorical : ndarray of shape (n_features,) or None, dtype=bool + Indicates whether a feature is categorical. If no feature is + categorical, this is None. + """ + # Special code for pandas because of a bug in recent pandas, which is + # fixed in main and maybe included in 2.2.1, see + # https://github.com/pandas-dev/pandas/pull/57173. + # Also pandas versions < 1.5.1 do not support the dataframe interchange + if _is_pandas_df(X): + X_is_dataframe = True + categorical_columns_mask = np.asarray(X.dtypes == "category") + elif hasattr(X, "__dataframe__"): + X_is_dataframe = True + categorical_columns_mask = np.asarray( + [ + c.dtype[0].name == "CATEGORICAL" + for c in X.__dataframe__().get_columns() + ] + ) + else: + X_is_dataframe = False + categorical_columns_mask = None + + categorical_features = self.categorical_features + + categorical_by_dtype = ( + isinstance(categorical_features, str) + and categorical_features == "from_dtype" + ) + no_categorical_dtype = categorical_features is None or ( + categorical_by_dtype and not X_is_dataframe + ) + + if no_categorical_dtype: + return None + + use_pandas_categorical = categorical_by_dtype and X_is_dataframe + if use_pandas_categorical: + categorical_features = categorical_columns_mask + else: + categorical_features = np.asarray(categorical_features) + + if categorical_features.size == 0: + return None + + if categorical_features.dtype.kind not in ("i", "b", "U", "O"): + raise ValueError( + "categorical_features must be an array-like of bool, int or " + f"str, got: {categorical_features.dtype.name}." 
+ ) + + if categorical_features.dtype.kind == "O": + types = set(type(f) for f in categorical_features) + if types != {str}: + raise ValueError( + "categorical_features must be an array-like of bool, int or " + f"str, got: {', '.join(sorted(t.__name__ for t in types))}." + ) + + n_features = X.shape[1] + # At this point `validate_data` was not called yet because we use the original + # dtypes to discover the categorical features. Thus `feature_names_in_` + # is not defined yet. + feature_names_in_ = getattr(X, "columns", None) + + if categorical_features.dtype.kind in ("U", "O"): + # check for feature names + if feature_names_in_ is None: + raise ValueError( + "categorical_features should be passed as an array of " + "integers or as a boolean mask when the model is fitted " + "on data without feature names." + ) + is_categorical = np.zeros(n_features, dtype=bool) + feature_names = list(feature_names_in_) + for feature_name in categorical_features: + try: + is_categorical[feature_names.index(feature_name)] = True + except ValueError as e: + raise ValueError( + f"categorical_features has a item value '{feature_name}' " + "which is not a valid feature name of the training " + f"data. Observed feature names: {feature_names}" + ) from e + elif categorical_features.dtype.kind == "i": + # check for categorical features as indices + if ( + np.max(categorical_features) >= n_features + or np.min(categorical_features) < 0 + ): + raise ValueError( + "categorical_features set as integer " + "indices must be in [0, n_features - 1]" + ) + is_categorical = np.zeros(n_features, dtype=bool) + is_categorical[categorical_features] = True + else: + if categorical_features.shape[0] != n_features: + raise ValueError( + "categorical_features set as a boolean mask " + "must have shape (n_features,), got: " + f"{categorical_features.shape}" + ) + is_categorical = categorical_features + + if not np.any(is_categorical): + return None + return is_categorical + def _validate_X_predict(self, X, check_input): """Validate the training data on predict (probabilities).""" if check_input: @@ -628,13 +776,16 @@ def _prune_tree(self): # build pruned tree if is_classifier(self): n_classes = np.atleast_1d(self.n_classes_) - pruned_tree = Tree(self.n_features_in_, n_classes, self.n_outputs_) + pruned_tree = Tree( + self.n_features_in_, n_classes, self.n_outputs_, self.is_categorical_ + ) else: pruned_tree = Tree( self.n_features_in_, # TODO: the tree shouldn't need this param np.array([1] * self.n_outputs_, dtype=np.intp), self.n_outputs_, + self.is_categorical_, ) _build_pruned_tree_ccp(pruned_tree, self.tree_, self.ccp_alpha) diff --git a/sklearn/tree/_partitioner.pyx b/sklearn/tree/_partitioner.pyx index bf6924953d12e..d0a4a66021761 100644 --- a/sklearn/tree/_partitioner.pyx +++ b/sklearn/tree/_partitioner.pyx @@ -740,6 +740,7 @@ cdef inline void extract_nnz_binary_search(const int32_t[::1] X_indices, n_samples * log(n_indices)). 
""" cdef intp_t n_samples + # printf("Is sorted: %d\n", is_samples_sorted[0]) if not is_samples_sorted[0]: n_samples = end - start diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 73642cd2e639c..27ef8e6416119 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -773,7 +773,7 @@ cdef class Tree: # TODO: Convert n_classes to cython.integral memory view once # https://github.com/cython/cython/issues/5243 is fixed - def __cinit__(self, intp_t n_features, cnp.ndarray n_classes, intp_t n_outputs): + def __cinit__(self, intp_t n_features, cnp.ndarray n_classes, intp_t n_outputs, cnp.ndarray is_categorical): """Constructor.""" cdef intp_t dummy = 0 size_t_dtype = np.array(dummy).dtype @@ -795,8 +795,12 @@ cdef class Tree: self.is_categorical = NULL safe_realloc(&self.is_categorical, n_features) - for f in range(n_features): - self.is_categorical[f] = False + if is_categorical is None: + for f in range(n_features): + self.is_categorical[f] = False + else: + for f in range(n_features): + self.is_categorical[f] = is_categorical[f] # Inner structures self.max_depth = 0 @@ -815,9 +819,12 @@ cdef class Tree: def __reduce__(self): """Reduce re-implementation, for pickling.""" - return (Tree, (self.n_features, - sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), - self.n_outputs), self.__getstate__()) + return (Tree, ( + self.n_features, + sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), + self.n_outputs, + sizet_ptr_to_ndarray(self.is_categorical, self.n_features) + ), self.__getstate__()) def __getstate__(self): """Getstate re-implementation, for pickling.""" diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 8ad5f507a2fab..9db97ec529e24 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -2141,13 +2141,15 @@ def get_different_alignment_node_ndarray(node_ndarray): def reduce_tree_with_different_bitness(tree): new_dtype = np.int64 if _IS_32BIT else np.int32 - tree_cls, (n_features, n_classes, n_outputs), state = tree.__reduce__() + tree_cls, (n_features, n_classes, n_outputs, is_categorical), state = ( + tree.__reduce__() + ) new_n_classes = n_classes.astype(new_dtype, casting="same_kind") new_state = state.copy() new_state["nodes"] = get_different_bitness_node_ndarray(new_state["nodes"]) - return (tree_cls, (n_features, new_n_classes, n_outputs), new_state) + return (tree_cls, (n_features, new_n_classes, n_outputs, is_categorical), new_state) def test_different_bitness_pickle(): @@ -2788,7 +2790,9 @@ def test_build_pruned_tree_py(): tree.fit(iris.data, iris.target) n_classes = np.atleast_1d(tree.n_classes_) - pruned_tree = CythonTree(tree.n_features_in_, n_classes, tree.n_outputs_) + pruned_tree = CythonTree( + tree.n_features_in_, n_classes, tree.n_outputs_, tree.is_categorical_ + ) # only keep the root note leave_in_subtree = np.zeros(tree.tree_.node_count, dtype=np.uint8) @@ -2802,7 +2806,9 @@ def test_build_pruned_tree_py(): assert_array_equal(tree.tree_.value[0], pruned_tree.value[0]) # now keep all the leaves - pruned_tree = CythonTree(tree.n_features_in_, n_classes, tree.n_outputs_) + pruned_tree = CythonTree( + tree.n_features_in_, n_classes, tree.n_outputs_, tree.is_categorical_ + ) leave_in_subtree = np.zeros(tree.tree_.node_count, dtype=np.uint8) leave_in_subtree[1:] = 1 @@ -2820,7 +2826,9 @@ def test_build_pruned_tree_infinite_loop(): tree = DecisionTreeClassifier(random_state=0, max_depth=1) tree.fit(iris.data, iris.target) n_classes = np.atleast_1d(tree.n_classes_) - pruned_tree = 
CythonTree(tree.n_features_in_, n_classes, tree.n_outputs_) + pruned_tree = CythonTree( + tree.n_features_in_, n_classes, tree.n_outputs_, tree.is_categorical_ + ) # only keeping one child as a leaf results in an improper tree leave_in_subtree = np.zeros(tree.tree_.node_count, dtype=np.uint8) From a27b16c8ce37879ecd695efa874de31af95fee78 Mon Sep 17 00:00:00 2001 From: Arthur Date: Wed, 24 Sep 2025 19:11:46 +0200 Subject: [PATCH 06/23] use categorical split in apply/predict; basic test test with categories work in notebook: logic seems to be correct --- sklearn/tree/_classes.py | 4 ++++ sklearn/tree/_tree.pyx | 19 ++++++++++++++----- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index f3147fef18cc1..3c300718cb3f1 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -1135,6 +1135,7 @@ def __init__( class_weight=None, ccp_alpha=0.0, monotonic_cst=None, + categorical_features=None, ): super().__init__( criterion=criterion, @@ -1150,6 +1151,7 @@ def __init__( min_impurity_decrease=min_impurity_decrease, monotonic_cst=monotonic_cst, ccp_alpha=ccp_alpha, + categorical_features=categorical_features, ) @_fit_context(prefer_skip_nested_validation=True) @@ -1512,6 +1514,7 @@ def __init__( min_impurity_decrease=0.0, ccp_alpha=0.0, monotonic_cst=None, + categorical_features=None, ): super().__init__( criterion=criterion, @@ -1526,6 +1529,7 @@ def __init__( min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, monotonic_cst=monotonic_cst, + categorical_features=categorical_features, ) @_fit_context(prefer_skip_nested_validation=True) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 27ef8e6416119..4db6bbd28fae3 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -799,6 +799,7 @@ cdef class Tree: for f in range(n_features): self.is_categorical[f] = False else: + is_categorical = is_categorical.astype(np.intp) for f in range(n_features): self.is_categorical[f] = is_categorical[f] @@ -1005,14 +1006,18 @@ cdef class Tree: node = self.nodes # While node not a leaf while node.left_child != _TREE_LEAF: - X_i_node_feature = X_ndarray[i, node.feature] # ... and node.right_child != _TREE_LEAF: + X_i_node_feature = X_ndarray[i, node.feature] if isnan(X_i_node_feature): if node.missing_go_to_left: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] - # TODO: handle categorical case: + if self.is_categorical[node.feature]: + if node.categorical_bitset & (1 << ( X_ndarray[i, node.feature])): + node = &self.nodes[node.left_child] + else: + node = &self.nodes[node.right_child] elif X_i_node_feature <= node.threshold: node = &self.nodes[node.left_child] else: @@ -1134,8 +1139,12 @@ cdef class Tree: indices[indptr[i + 1]] = (node - self.nodes) indptr[i + 1] += 1 - # TODO handle categorical - if X_ndarray[i, node.feature] <= node.threshold: + if self.is_categorical[node.feature]: + if node.categorical_bitset & (1 << ( X_ndarray[i, node.feature])): + node = &self.nodes[node.left_child] + else: + node = &self.nodes[node.right_child] + elif X_ndarray[i, node.feature] <= node.threshold: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] @@ -1415,7 +1424,7 @@ cdef class Tree: if is_target_feature: # In this case, we push left or right child on stack - # TODO: handle categotical (and missing?) + # TODO: handle categorical (and missing?) 
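+                # A possible shape for that branch, mirroring apply() above
+                # (a sketch only, not wired in yet):
+                #
+                #     if self.is_categorical[feature_idx]:
+                #         if current_node.categorical_bitset & (1 << (<intp_t> X[sample_idx, feature_idx])):
+                #             node_idx_stack[stack_size] = current_node.left_child
+                #         else:
+                #             node_idx_stack[stack_size] = current_node.right_child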
if X[sample_idx, feature_idx] <= current_node.threshold: node_idx_stack[stack_size] = current_node.left_child else: From 288c1eaf14ee3fac91e85dceb1a727a4ceea9adc Mon Sep 17 00:00:00 2001 From: Arthur Date: Wed, 24 Sep 2025 22:22:08 +0200 Subject: [PATCH 07/23] refacto check_cat; better error messages --- sklearn/tree/_classes.py | 130 ++++++++++----------------------------- 1 file changed, 34 insertions(+), 96 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 3c300718cb3f1..6ef5ac5b7985a 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -47,7 +47,6 @@ _assert_all_finite_element_wise, _check_n_features, _check_sample_weight, - _is_pandas_df, assert_all_finite, check_is_fitted, validate_data, @@ -249,8 +248,6 @@ def _fit( ): random_state = check_random_state(self.random_state) - self.is_categorical_ = self._check_categorical_features(X) - if check_input: # Need to validate separately here. # We can't pass multi_output=True because that would allow y to be @@ -446,6 +443,10 @@ def _fit( # *positive class*, all signs must be flipped. monotonic_cst *= -1 + self.is_categorical_, n_categories_in_feature = ( + self._check_categorical_features(X, monotonic_cst) + ) + if not isinstance(self.splitter, Splitter): splitter = SPLITTERS[self.splitter]( criterion, @@ -493,14 +494,6 @@ def _fit( self.min_impurity_decrease, ) - n_categories_in_feature = np.full(self.n_features_in_, -1, dtype=np.intp) - if self.is_categorical_ is not None: - for idx in np.where(self.is_categorical_)[0]: - assert np.allclose(X[:, idx].astype(np.intp), X[:, idx]) - assert X[:, idx].min() >= 0 - n_categories_in_feature[idx] = X[:, idx].max() + 1 - assert (n_categories_in_feature <= 64).all() - builder.build( self.tree_, X, @@ -518,101 +511,31 @@ def _fit( return self - def _check_categorical_features(self, X): - """Copied from gradient_boosting.py on sept. 2025 - Check and validate categorical features in X + def _check_categorical_features(self, X, monotonic_cst): + """Check and validate categorical features in X Parameters ---------- - X : {array-like, pandas DataFrame} of shape (n_samples, n_features) - Input data. + X : {array-like} of shape (n_samples, n_features) + Input data (after `validate_data` was called) Return ------ is_categorical : ndarray of shape (n_features,) or None, dtype=bool Indicates whether a feature is categorical. If no feature is categorical, this is None. + n_categories_in_feature: TODO """ - # Special code for pandas because of a bug in recent pandas, which is - # fixed in main and maybe included in 2.2.1, see - # https://github.com/pandas-dev/pandas/pull/57173. 
- # Also pandas versions < 1.5.1 do not support the dataframe interchange - if _is_pandas_df(X): - X_is_dataframe = True - categorical_columns_mask = np.asarray(X.dtypes == "category") - elif hasattr(X, "__dataframe__"): - X_is_dataframe = True - categorical_columns_mask = np.asarray( - [ - c.dtype[0].name == "CATEGORICAL" - for c in X.__dataframe__().get_columns() - ] - ) - else: - X_is_dataframe = False - categorical_columns_mask = None - - categorical_features = self.categorical_features - - categorical_by_dtype = ( - isinstance(categorical_features, str) - and categorical_features == "from_dtype" - ) - no_categorical_dtype = categorical_features is None or ( - categorical_by_dtype and not X_is_dataframe - ) - - if no_categorical_dtype: - return None - - use_pandas_categorical = categorical_by_dtype and X_is_dataframe - if use_pandas_categorical: - categorical_features = categorical_columns_mask - else: - categorical_features = np.asarray(categorical_features) - - if categorical_features.size == 0: - return None + n_features = X.shape[1] + categorical_features = np.asarray(self.categorical_features) - if categorical_features.dtype.kind not in ("i", "b", "U", "O"): + if self.categorical_features is None or categorical_features.size == 0: + is_categorical = np.zeros(n_features, dtype=bool) + elif categorical_features.dtype.kind not in ("i", "b"): raise ValueError( - "categorical_features must be an array-like of bool, int or " - f"str, got: {categorical_features.dtype.name}." + "categorical_features must be an array-like of bool or int, " + f"got: {categorical_features.dtype.name}." ) - - if categorical_features.dtype.kind == "O": - types = set(type(f) for f in categorical_features) - if types != {str}: - raise ValueError( - "categorical_features must be an array-like of bool, int or " - f"str, got: {', '.join(sorted(t.__name__ for t in types))}." - ) - - n_features = X.shape[1] - # At this point `validate_data` was not called yet because we use the original - # dtypes to discover the categorical features. Thus `feature_names_in_` - # is not defined yet. - feature_names_in_ = getattr(X, "columns", None) - - if categorical_features.dtype.kind in ("U", "O"): - # check for feature names - if feature_names_in_ is None: - raise ValueError( - "categorical_features should be passed as an array of " - "integers or as a boolean mask when the model is fitted " - "on data without feature names." - ) - is_categorical = np.zeros(n_features, dtype=bool) - feature_names = list(feature_names_in_) - for feature_name in categorical_features: - try: - is_categorical[feature_names.index(feature_name)] = True - except ValueError as e: - raise ValueError( - f"categorical_features has a item value '{feature_name}' " - "which is not a valid feature name of the training " - f"data. Observed feature names: {feature_names}" - ) from e elif categorical_features.dtype.kind == "i": # check for categorical features as indices if ( @@ -634,9 +557,24 @@ def _check_categorical_features(self, X): ) is_categorical = categorical_features - if not np.any(is_categorical): - return None - return is_categorical + n_categories_in_feature = np.full(self.n_features_in_, -1, dtype=np.intp) + MAX_NC = 64 # TODO import from somewhere + base_msg = ( + f"Values for categorical features should be integers in [0, {MAX_NC - 1}]." 
+        )
+        for idx in np.where(is_categorical)[0]:
+            if not np.allclose(X[:, idx].astype(np.intp), X[:, idx]):
+                raise ValueError(f"{base_msg} Found non-integer values.")
+            if X[:, idx].min() < 0:
+                raise ValueError(f"{base_msg} Found negative values.")
+            X_idx_max = X[:, idx].max()
+            if X_idx_max >= MAX_NC:
+                raise ValueError(f"{base_msg} Found {X_idx_max}.")
+            n_categories_in_feature[idx] = X_idx_max + 1
+            if monotonic_cst is not None and monotonic_cst[idx] != 0:
+                raise ValueError("Categorical features are incompatible with ")
+
+        return is_categorical, n_categories_in_feature

     def _validate_X_predict(self, X, check_input):
         """Validate the training data on predict (probabilities)."""

From f512e0323ad7f08f6f801d367b893a119211c3b5 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Wed, 24 Sep 2025 22:46:21 +0200
Subject: [PATCH 08/23] fix segfault

---
 sklearn/tree/_tree.pyx | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx
index 4db6bbd28fae3..ed71e159b1792 100644
--- a/sklearn/tree/_tree.pyx
+++ b/sklearn/tree/_tree.pyx
@@ -1013,8 +1013,8 @@ cdef class Tree:
                             node = &self.nodes[node.left_child]
                         else:
                             node = &self.nodes[node.right_child]
-                    if self.is_categorical[node.feature]:
-                        if node.categorical_bitset & (1 << (<intp_t> X_ndarray[i, node.feature])):
+                    elif self.is_categorical[node.feature]:
+                        if node.categorical_bitset & (1 << (<intp_t> X_i_node_feature)):
                             node = &self.nodes[node.left_child]
                         else:
                             node = &self.nodes[node.right_child]
@@ -1139,6 +1139,7 @@ cdef class Tree:
                     indices[indptr[i + 1]] = <intp_t>(node - self.nodes)
                     indptr[i + 1] += 1

+                    # TODO: handle missing values?
                     if self.is_categorical[node.feature]:
                         if node.categorical_bitset & (1 << (<intp_t> X_ndarray[i, node.feature])):
                             node = &self.nodes[node.left_child]
                         else:
                             node = &self.nodes[node.right_child]

From eb7f94ca4cd029150bdff722f9ac977fb617dbcc Mon Sep 17 00:00:00 2001
From: Arthur
Date: Thu, 25 Sep 2025 10:29:42 +0200
Subject: [PATCH 09/23] added basic but strong tests; fixed some minor but
 impactful bugs revealed by the newly added tests

---
 sklearn/tree/_partitioner.pxd   |  4 ++--
 sklearn/tree/_partitioner.pyx   | 25 +++++++++++++++++++------
 sklearn/tree/_splitter.pyx      | 13 +++----------
 sklearn/tree/tests/test_tree.py | 31 +++++++++++++++++++++++++++++--
 4 files changed, 53 insertions(+), 20 deletions(-)

diff --git a/sklearn/tree/_partitioner.pxd b/sklearn/tree/_partitioner.pxd
index 12fd019f1b8aa..a157d259cae29 100644
--- a/sklearn/tree/_partitioner.pxd
+++ b/sklearn/tree/_partitioner.pxd
@@ -87,7 +87,7 @@ cdef class DensePartitioner:
     cdef intp_t[::1] sorted_cat
     cdef intp_t[::1] offsets

-    cdef void sort_samples_and_feature_values(
+    cdef bint sort_samples_and_feature_values(
         self, intp_t current_feature
     ) noexcept nogil
     cdef void shift_missing_to_the_left(self) noexcept nogil
@@ -150,7 +150,7 @@ cdef class SparsePartitioner:
     cdef const uint8_t[::1] missing_values_in_feature_mask
     cdef intp_t n_categories

-    cdef void sort_samples_and_feature_values(
+    cdef bint sort_samples_and_feature_values(
        self, intp_t current_feature
     ) noexcept nogil
     cdef void shift_missing_to_the_left(self) noexcept nogil
diff --git a/sklearn/tree/_partitioner.pyx b/sklearn/tree/_partitioner.pyx
index d0a4a66021761..7feee5b8f082e 100644
--- a/sklearn/tree/_partitioner.pyx
+++ b/sklearn/tree/_partitioner.pyx
@@ -71,7 +71,7 @@ cdef class DensePartitioner:
         self.end = end
         self.n_missing = 0

-    cdef inline void sort_samples_and_feature_values(
+    cdef inline bint sort_samples_and_feature_values(
         self, intp_t current_feature
     ) noexcept nogil:
         """Simultaneously sort based on the
feature_values. @@ -115,14 +115,18 @@ cdef class DensePartitioner: for i in range(self.start, self.end): feature_values[i] = X[samples[i], current_feature] + self.missing_on_the_left = False + self.n_missing = n_missing self.n_categories = self.n_categories_in_feature[current_feature] + if n_missing == self.end - self.start: + return True if self.n_categories <= 0: # no a categorical feature sort(&feature_values[self.start], &samples[self.start], self.end - self.start - n_missing) + return feature_values[self.end - n_missing - 1] <= feature_values[self.start] + FEATURE_THRESHOLD else: self._breiman_sort_categories(self.n_categories) - self.missing_on_the_left = False - self.n_missing = n_missing + return feature_values[self.start] == feature_values[self.end - 1] cdef void _breiman_sort_categories(self, intp_t nc) noexcept nogil: """ O(n + nc log nc)""" @@ -249,12 +253,20 @@ cdef class DensePartitioner: cdef intp_t end_non_missing = ( self.end if self.missing_on_the_left else self.end - self.n_missing) + cdef float32_t c if p[0] == end_non_missing and not self.missing_on_the_left: # skip the missing values up to the end # (which will end the for loop in the best split function) p[0] = self.end p_prev[0] = self.end + elif self.n_categories > 0: + c = self.feature_values[p[0]] + p[0] += 1 + while p[0] < end_non_missing and self.feature_values[p[0]] == c: + p[0] += 1 + + # p_prev is unused in this case else: if self.missing_on_the_left and p[0] == self.start: # skip the missing values up to the first non-missing value: @@ -270,11 +282,11 @@ cdef class DensePartitioner: cdef inline SplitValue pos_to_threshold( self, intp_t p_prev, intp_t p ) noexcept nogil: + cdef SplitValue split cdef intp_t end_non_missing = ( self.end if self.missing_on_the_left else self.end - self.n_missing) - cdef SplitValue split if self.n_categories > 0: split.cat_split = self._split_pos_to_bitset(p, self.n_categories) return split @@ -407,7 +419,7 @@ cdef class SparsePartitioner: self.is_samples_sorted = 0 self.n_missing = 0 - cdef inline void sort_samples_and_feature_values( + cdef inline bint sort_samples_and_feature_values( self, intp_t current_feature ) noexcept nogil: @@ -446,6 +458,8 @@ cdef class SparsePartitioner: # number of missing values for current_feature self.n_missing = 0 + return feature_values[self.end - 1] <= feature_values[self.start] + FEATURE_THRESHOLD + cdef void shift_missing_to_the_left(self) noexcept nogil: pass # missing values not support for sparse @@ -740,7 +754,6 @@ cdef inline void extract_nnz_binary_search(const int32_t[::1] X_indices, n_samples * log(n_indices)). 
""" cdef intp_t n_samples - # printf("Is sorted: %d\n", is_samples_sorted[0]) if not is_samples_sorted[0]: n_samples = end - start diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 80721d8f7e7c0..86f8fcfb2d2a1 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -281,7 +281,6 @@ cdef inline int node_split_best( # Find the best split cdef intp_t start = splitter.start cdef intp_t end = splitter.end - cdef intp_t end_non_missing cdef intp_t n_missing = 0 cdef bint has_missing = 0 cdef intp_t n_searches @@ -292,7 +291,6 @@ cdef inline int node_split_best( cdef intp_t[::1] constant_features = splitter.constant_features cdef intp_t n_features = splitter.n_features - cdef float32_t[::1] feature_values = splitter.feature_values cdef intp_t max_features = splitter.max_features cdef intp_t min_samples_leaf = splitter.min_samples_leaf cdef float64_t min_weight_leaf = splitter.min_weight_leaf @@ -308,6 +306,7 @@ cdef inline int node_split_best( cdef intp_t f_i = n_features cdef intp_t f_j + cdef bint is_constant cdef intp_t p cdef intp_t p_prev @@ -367,16 +366,10 @@ cdef inline int node_split_best( f_j += n_found_constants # f_j in the interval [n_total_constants, f_i[ current_split.feature = features[f_j] - partitioner.sort_samples_and_feature_values(current_split.feature) + is_constant = partitioner.sort_samples_and_feature_values(current_split.feature) n_missing = partitioner.n_missing - end_non_missing = end - n_missing - if ( - # All values for this feature are missing, or - end_non_missing == start or - # This feature is considered constant (max - min <= FEATURE_THRESHOLD) - feature_values[end_non_missing - 1] <= feature_values[start] + FEATURE_THRESHOLD - ): + if is_constant: # We consider this feature constant in this case. # Since finding a split among constant feature is not valuable, # we do not consider this feature for splitting. 
diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 9db97ec529e24..339c85b08a291 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -2053,10 +2053,18 @@ def test_criterion_entropy_same_as_log_loss(Tree, n_classes): assert_allclose(tree_log_loss.predict(X), tree_entropy.predict(X)) +def to_categorical(x, nc): + q = np.linspace(0, 1, num=nc + 1)[1:-1] + quantiles = np.quantile(x, q) + cats = np.searchsorted(quantiles, x) + return np.random.permutation(nc)[cats] + + def test_different_endianness_pickle(): X, y = datasets.make_classification(random_state=0) + X[:, 0] = to_categorical(X[:, 0], 5) - clf = DecisionTreeClassifier(random_state=0, max_depth=3) + clf = DecisionTreeClassifier(random_state=0, max_depth=3, categorical_features=[0]) clf.fit(X, y) score = clf.score(X, y) @@ -2080,8 +2088,9 @@ def get_pickle_non_native_endianness(): def test_different_endianness_joblib_pickle(): X, y = datasets.make_classification(random_state=0) + X[:, 0] = to_categorical(X[:, 0], 5) - clf = DecisionTreeClassifier(random_state=0, max_depth=3) + clf = DecisionTreeClassifier(random_state=0, max_depth=3, categorical_features=[0]) clf.fit(X, y) score = clf.score(X, y) @@ -2859,3 +2868,21 @@ def test_sort_log2_build(): ] # fmt: on assert_array_equal(samples, expected_samples) + + +@pytest.mark.parametrize("Tree", [DecisionTreeClassifier, DecisionTreeRegressor]) +def test_categorical(Tree): + n = 40 + c = np.random.randint(0, 20, size=n) + y = c % 2 + + X = np.random.rand(n, 3) + X[:, 0] = c + + tree = Tree(categorical_features=[0], max_depth=1) + # assert perfect tree was reached in one split + assert tree.fit(X, y).score(X, y) == 1 + + # assert it's not the case without using categorical_features + tree = Tree(max_depth=1) + assert tree.fit(X, y).score(X, y) < 1 From 9afc1d8e40c73d8e240a8d3b1654744277f14c6a Mon Sep 17 00:00:00 2001 From: Arthur Date: Thu, 25 Sep 2025 13:17:19 +0200 Subject: [PATCH 10/23] Minor tests changes --- sklearn/tree/tests/test_tree.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 339c85b08a291..ca72173adb527 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -2061,11 +2061,13 @@ def to_categorical(x, nc): def test_different_endianness_pickle(): - X, y = datasets.make_classification(random_state=0) - X[:, 0] = to_categorical(X[:, 0], 5) + X, y = datasets.make_classification(random_state=0, n_redundant=0, shuffle=False) + X[:, 0] = to_categorical(X[:, 0], 50) clf = DecisionTreeClassifier(random_state=0, max_depth=3, categorical_features=[0]) clf.fit(X, y) + assert 0 < clf.feature_importances_[0] < 1 + # ^ ensures some splits are categorical, some are continuous score = clf.score(X, y) def reduce_ndarray(arr): @@ -2088,10 +2090,12 @@ def get_pickle_non_native_endianness(): def test_different_endianness_joblib_pickle(): X, y = datasets.make_classification(random_state=0) - X[:, 0] = to_categorical(X[:, 0], 5) + X[:, 0] = to_categorical(X[:, 0], 50) clf = DecisionTreeClassifier(random_state=0, max_depth=3, categorical_features=[0]) clf.fit(X, y) + assert 0 < clf.feature_importances_[0] < 1 + # ^ ensures some splits are categorical, some are continuous score = clf.score(X, y) class NonNativeEndiannessNumpyPickler(NumpyPickler): @@ -2872,16 +2876,18 @@ def test_sort_log2_build(): @pytest.mark.parametrize("Tree", [DecisionTreeClassifier, DecisionTreeRegressor]) def 
test_categorical(Tree): + rng = np.random.default_rng(3) n = 40 - c = np.random.randint(0, 20, size=n) + c = rng.integers(0, 20, size=n) y = c % 2 - X = np.random.rand(n, 3) + X = rng.random((n, 3)) X[:, 0] = c - tree = Tree(categorical_features=[0], max_depth=1) + tree = Tree(categorical_features=[0], max_depth=1, random_state=8) # assert perfect tree was reached in one split assert tree.fit(X, y).score(X, y) == 1 + assert tree.feature_importances_[0] == 1 # assert it's not the case without using categorical_features tree = Tree(max_depth=1) From 1d796cfd46a48a181fc1452aad0ff254da1f1be6 Mon Sep 17 00:00:00 2001 From: Arthur Date: Thu, 25 Sep 2025 16:07:32 +0200 Subject: [PATCH 11/23] test categorical; found a new bug in missing --- sklearn/tree/_partitioner.pyx | 4 +- sklearn/tree/_tree.pyx | 4 + sklearn/tree/tests/test_split.py | 150 +++++++++++++++++++++++-------- 3 files changed, 119 insertions(+), 39 deletions(-) diff --git a/sklearn/tree/_partitioner.pyx b/sklearn/tree/_partitioner.pyx index 7feee5b8f082e..602fc202383d4 100644 --- a/sklearn/tree/_partitioner.pyx +++ b/sklearn/tree/_partitioner.pyx @@ -123,6 +123,8 @@ cdef class DensePartitioner: if self.n_categories <= 0: # no a categorical feature sort(&feature_values[self.start], &samples[self.start], self.end - self.start - n_missing) + if n_missing > 0: + return False return feature_values[self.end - n_missing - 1] <= feature_values[self.start] + FEATURE_THRESHOLD else: self._breiman_sort_categories(self.n_categories) @@ -152,9 +154,9 @@ cdef class DensePartitioner: # FIXME: overflow risk + float32 downcast: precision issues? # `sort` only supports float32 for now, that's why means is float32 # maybe we can use some templating to extend sort to float64 - means[c] += self.y[samples[p], 0] if self.sample_weight is not None: w = self.sample_weight[samples[p]] + means[c] += w * self.y[samples[p], 0] self.weighted_counts[c] += w for c in range(nc): diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index ed71e159b1792..4a99bed472f40 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -751,6 +751,10 @@ cdef class Tree: def threshold(self): return self._get_node_ndarray()['threshold'][:self.node_count] + @property + def categorical_bitset(self): + return self._get_node_ndarray()['categorical_bitset'][:self.node_count] + @property def impurity(self): return self._get_node_ndarray()['impurity'][:self.node_count] diff --git a/sklearn/tree/tests/test_split.py b/sklearn/tree/tests/test_split.py index 55287d4d87267..d79a3ac2b443e 100644 --- a/sklearn/tree/tests/test_split.py +++ b/sklearn/tree/tests/test_split.py @@ -1,5 +1,7 @@ from dataclasses import dataclass -from functools import partial +from functools import cached_property, partial +from itertools import chain, combinations +from operator import itemgetter import numpy as np import pytest @@ -17,9 +19,11 @@ REG_CRITERIONS = ("squared_error", "absolute_error", "friedman_mse", "poisson") -def gini_loss(y, y_pred, sample_weight): - p = y_pred[0] - return (p * (1 - p)).sum() +def powerset(iterable): + s = list(iterable) # allows handling sets too + return chain.from_iterable( + (list(c) for c in combinations(s, r)) for r in range(1, (len(s) + 1) // 2 + 1) + ) @dataclass @@ -28,10 +32,7 @@ class NaiveSplitter: criterion: str with_nans: bool n_classes: int - - @staticmethod - def weighted_mean(y, w): - return (y * w).sum() / w.sum() + is_categorical: np.ndarray @staticmethod def weighted_median(y, w): @@ -40,28 +41,33 @@ def weighted_median(y, w): idx = 
np.searchsorted(wc, wc[-1] / 2) return y[sorter[idx]] - def class_ratios(self, y, w): - return np.clip(np.bincount(y, w, minlength=self.n_classes) / w.sum(), None, 1) + @staticmethod + def gini_loss(y, y_pred, sample_weight): + p = y_pred[0] + return (p * (1 - p)).sum() - @property + @cached_property def loss(self): losses = { "poisson": mean_poisson_deviance, "squared_error": mean_squared_error, "absolute_error": mean_absolute_error, - "log_loss": partial(log_loss, labels=list(range(self.n_classes))), - "gini": gini_loss, + "log_loss": partial(log_loss, labels=np.arange(self.n_classes)), + "gini": self.gini_loss, } return losses[self.criterion] - @property + def class_ratios(self, y, w): + return np.clip(np.bincount(y, w, minlength=self.n_classes) / w.sum(), None, 1) + + @cached_property def predictor(self): if self.is_clf: return self.class_ratios elif self.criterion == "absolute_error": return self.weighted_median else: - return self.weighted_mean + return lambda y, w: np.average(y, weights=w) def compute_child_loss(self, y: np.ndarray, w: np.ndarray): if y.size == 0: @@ -71,13 +77,20 @@ def compute_child_loss(self, y: np.ndarray, w: np.ndarray): y_pred[:] = self.predictor(y, w) return w.sum() * self.loss(y, y_pred, sample_weight=w) - def compute_split_loss(self, x, y, w, threshold, missing_left=False): - mask = x < threshold + def compute_split_loss( + self, x, y, w, threshold=None, categories=None, missing_left=False + ): + if categories is not None: + mask_c = np.zeros(int(x.max() + 1), dtype=bool) + mask_c[categories] = True + mask = mask_c[x.astype(int)] + else: + mask = x < threshold if missing_left: mask |= np.isnan(x) if self.criterion == "friedman_mse": - diff = self.weighted_mean(y[mask], w[mask]) - self.weighted_mean( - y[~mask], w[~mask] + diff = np.average(y[mask], weights=w[mask]) - np.average( + y[~mask], weights=w[~mask] ) return (-(diff**2) * w[mask].sum() * w[~mask].sum() / w.sum(),) return ( @@ -85,21 +98,32 @@ def compute_split_loss(self, x, y, w, threshold, missing_left=False): self.compute_child_loss(y[~mask], w[~mask]), ) - def compute_all_losses(self, x, y, w, missing_left=False): + def compute_all_losses(self, x, y, w, is_categorical=False, missing_left=False): + if is_categorical: + return self.compute_all_losses_categorical(x, y, w) nan_mask = np.isnan(x) xu = np.unique(x[~nan_mask], sorted=True) thresholds = (xu[1:] + xu[:-1]) / 2 if nan_mask.any() and not missing_left: thresholds = np.append(thresholds, xu.max() * 2) return thresholds, [ - sum(self.compute_split_loss(x, y, w, threshold, missing_left)) + sum(self.compute_split_loss(x, y, w, threshold, missing_left=missing_left)) for threshold in thresholds ] + def compute_all_losses_categorical(self, x, y, w): + cat_splits = list(powerset(np.unique(x).astype(int))) + return cat_splits, [ + sum(self.compute_split_loss(x, y, w, categories=left_cat)) + for left_cat in cat_splits + ] + def best_split_naive(self, X, y, w): splits = [] for f in range(X.shape[1]): - thresholds, losses = self.compute_all_losses(X[:, f], y, w) + thresholds, losses = self.compute_all_losses( + X[:, f], y, w, is_categorical=self.is_categorical[f] + ) if self.with_nans: thresholds_, losses_ = self.compute_all_losses( X[:, f], y, w, missing_left=True @@ -117,7 +141,7 @@ def best_split_naive(self, X, y, w): f, ) ) - return min(splits) + return min(splits, key=itemgetter(0)) def sparsify(X, density): @@ -128,13 +152,30 @@ def sparsify(X, density): return csc_array(X) +def to_categorical(x, nc): + q = np.linspace(0, 1, num=nc + 1)[1:-1] + 
quantiles = np.quantile(x, q)
+    cats = np.searchsorted(quantiles, x)
+    return np.random.permutation(nc)[cats]
+
+
 def make_simple_dataset(
-    n, d, with_nans, is_sparse, is_clf, n_classes, rng: np.random.Generator
+    n,
+    d,
+    with_nans,
+    is_sparse,
+    is_categorical,
+    is_clf,
+    n_classes,
+    rng: np.random.Generator,
 ):
     X_dense = np.random.rand(n, d)
     y = np.random.rand(n) + X_dense.sum(axis=1)
     w = np.random.rand(n)

+    for idx in np.where(is_categorical)[0]:
+        nc = rng.integers(2, 6)  # can't go too high or the test will be too slow
+        X_dense[:, idx] = to_categorical(X_dense[:, idx], nc)
     with_duplicates = rng.integers(2) == 0
     if with_duplicates:
         X_dense = X_dense.round(1 if n < 50 else 2)
@@ -155,44 +196,77 @@ def make_simple_dataset(
     return X_dense, X, y, w


+def bitset_to_set(v: np.uint64):
+    return [c for c in range(64) if v & (1 << c)]
+
+
 @pytest.mark.parametrize("sparse", ["x", "sparse"])
-@pytest.mark.parametrize("missing_values", ["x", "missing_values"])
+@pytest.mark.parametrize("categorical", ["x", "categorical"])
+@pytest.mark.parametrize("missing_values", ["missing_values"])
 @pytest.mark.parametrize(
     "criterion",
-    ["gini", "log_loss", "squared_error", "absolute_error", "friedman_mse", "poisson"],
+    ["gini", "log_loss", "squared_error", "friedman_mse", "poisson"],
 )
-def test_best_split_optimality(sparse, missing_values, criterion, global_random_seed):
+def test_best_split_optimality(
+    sparse, categorical, missing_values, criterion, global_random_seed
+):
     is_clf = criterion in CLF_CRITERIONS
     with_nans = missing_values != "x"
     is_sparse = sparse != "x"
+    with_categoricals = categorical != "x"
     if is_sparse and with_nans:
         pytest.skip("Sparse + missing values not supported yet")
+    if with_categoricals and (is_sparse or criterion == "absolute_error" or with_nans):
+        pytest.skip("Categorical features not supported in this case")

-    rng = np.random.default_rng(global_random_seed)
+    rng = np.random.default_rng()

-    d = 2
-    for it, n in enumerate([5] * 10 + [10] * 10 + [30] * 5 + [100, 100, 200]):
-        n_classes = rng.integers(2, 5)
+    ns = [5] * 5 + [10] * 5 + [30, 30]
+    if not with_categoricals and criterion != "log_loss":
+        ns.extend([30, 30, 30, 100, 100, 200])
+
+    for it, n in enumerate(ns):
+        d = rng.integers(1, 4)
+        n_classes = 2 if with_categoricals else rng.integers(2, 5)
+        if with_categoricals:
+            is_categorical = rng.random(d) < 0.5
+        else:
+            is_categorical = np.zeros(d, dtype=bool)
         X_dense, X, y, w = make_simple_dataset(
-            n, d, with_nans, is_sparse, is_clf, n_classes, rng
+            n, d, with_nans, is_sparse, is_categorical, is_clf, n_classes, rng
         )
-        naive_splitter = NaiveSplitter(is_clf, criterion, with_nans, n_classes)
+        naive_splitter = NaiveSplitter(
+            is_clf, criterion, with_nans, n_classes, is_categorical
+        )
+        best_split = naive_splitter.best_split_naive(X_dense, y, w)

+        is_categorical = is_categorical if is_categorical.any() else None
         Tree = DecisionTreeClassifier if is_clf else DecisionTreeRegressor
-        tree = Tree(criterion=criterion, max_depth=1, max_features=d)
+        tree = Tree(
+            criterion=criterion,
+            max_depth=1,
+            max_features=d,
+            categorical_features=is_categorical,
+        )
         tree.fit(X, y, sample_weight=w)
-        best_split = naive_splitter.best_split_naive(X_dense, y, w)
+
+        split_feature = tree.tree_.feature[0]
+        split = (
+            {"threshold": tree.tree_.threshold[0]}
+            if is_categorical is None or not is_categorical[split_feature]
+            else {"categories": bitset_to_set(tree.tree_.categorical_bitset[0])}
+        )
         tree_loss = naive_splitter.compute_split_loss(
             X_dense[:, tree.tree_.feature[0]],
             y,
             w,
-
tree.tree_.threshold[0],
-            bool(tree.tree_.missing_go_to_left[0]),
+            **split,
+            missing_left=bool(tree.tree_.missing_go_to_left[0]),
         )
         tree_split = (
             sum(tree_loss),
-            tree.tree_.threshold[0],
+            split,
             bool(tree.tree_.missing_go_to_left[0]),
             tree.tree_.feature[0],
         )

From 5c3de79000111c01723dca885398f80ddf905207 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Thu, 25 Sep 2025 23:02:48 +0200
Subject: [PATCH 12/23] docstring for Breiman sort

---
 sklearn/tree/_partitioner.pyx | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/sklearn/tree/_partitioner.pyx b/sklearn/tree/_partitioner.pyx
index 602fc202383d4..ab6cd3c7a37ed 100644
--- a/sklearn/tree/_partitioner.pyx
+++ b/sklearn/tree/_partitioner.pyx
@@ -131,7 +131,21 @@ cdef class DensePartitioner:
         return feature_values[self.start] == feature_values[self.end - 1]

     cdef void _breiman_sort_categories(self, intp_t nc) noexcept nogil:
-        """ O(n + nc log nc)"""
+        """
+        Order self.sorted_cat by ascending average target value
+        and order self.feature_values & self.samples such that
+        - self.feature_values is ordered according to the order of sorted_cat
+        - the relation `self.feature_values[p] = self.X[f, self.samples[p]]` is
+          preserved
+
+        E.g.
 sorted_cat is [2 0 1]
+        feature_values is [2 2 2 0 0 1 1 1 1]
+
+        This ordering ensures the optimal split will be among the candidate splits
+        evaluated by the splitter (this is called the Breiman shortcut).
+
+        Time complexity: O(n + nc log nc)
+        """
         cdef:
             intp_t* counts = &self.counts[0]
             float64_t* weighted_counts = &self.weighted_counts[0]
@@ -151,9 +165,6 @@ cdef class DensePartitioner:
         for p in range(self.start, self.end):
             c = feature_values[p]
             counts[c] += 1
-            # FIXME: overflow risk + float32 downcast: precision issues?
-            # `sort` only supports float32 for now, that's why means is float32
-            # maybe we can use some templating to extend sort to float64
             if self.sample_weight is not None:
                 w = self.sample_weight[samples[p]]
             means[c] += w * self.y[samples[p], 0]

From f47ca8e6db782a56baafb2dc2671866da31917e1 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Fri, 26 Sep 2025 00:20:29 +0200
Subject: [PATCH 13/23] minor comments fixes

---
 sklearn/tree/_partitioner.pyx    | 13 ++++++-------
 sklearn/tree/tests/test_split.py |  4 ++--
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/sklearn/tree/_partitioner.pyx b/sklearn/tree/_partitioner.pyx
index ab6cd3c7a37ed..d0f51e9341d59 100644
--- a/sklearn/tree/_partitioner.pyx
+++ b/sklearn/tree/_partitioner.pyx
@@ -135,7 +135,7 @@ cdef class DensePartitioner:
         Order self.sorted_cat by ascending average target value
         and order self.feature_values & self.samples such that
         - self.feature_values is ordered according to the order of sorted_cat
-        - the relation `self.feature_values[p] = self.X[f, self.samples[p]]` is
+        - the relation `self.feature_values[p] = self.X[self.samples[p], f]` is
           preserved

         E.g.
@@ -171,16 +171,15 @@ cdef class DensePartitioner:
         for c in range(nc):
-            # TODO: weighted counts
             if weighted_counts[c] > 0:
                 means[c] /= weighted_counts[c]

-        # sorted_cat[i] = i-th categories sorted my ascending means
+        # sorted_cat[i] = i-th category sorted by ascending means
         for c in range(nc):
             sorted_cat[c] = c
         sort(means, sorted_cat, nc)

-        # ) build offsets such that:
+        # build offsets such that:
         # offsets[c] = sum( counts[x] for all x s.t. rank(x) <= rank(c) ) - 1
         cdef intp_t offset = 0
         for r in range(nc):
@@ -188,8 +187,8 @@ cdef class DensePartitioner:
             offset += counts[c]
             offsets[c] = offset - 1

-        # sort feature_values & samples such that they are ordered by
-        # the mean of the category
+        # sort feature_values & samples in-place such that
+        # they are ordered by the mean of the category
         # while ensuring samples of the same categories are contiguous
         p = self.start
         while p < self.end:
@@ -197,7 +196,7 @@ cdef class DensePartitioner:
             new_p = offsets[c]
             if new_p > p:
                 swap(feature_values, samples, p, new_p)
-                # ^ preserves invariant: feature[p] = X[samples[p], f]
+                # swap preserves invariant: feature[p] = X[samples[p], f]
                 offsets[c] -= 1
             else:
                 p += 1
diff --git a/sklearn/tree/tests/test_split.py b/sklearn/tree/tests/test_split.py
index d79a3ac2b443e..5bbedb7207903 100644
--- a/sklearn/tree/tests/test_split.py
+++ b/sklearn/tree/tests/test_split.py
@@ -200,9 +200,9 @@ def bitset_to_set(v: np.uint64):
     return [c for c in range(64) if v & (1 << c)]


-@pytest.mark.parametrize("sparse", ["x", "sparse"])
+@pytest.mark.parametrize("sparse", ["x"])
 @pytest.mark.parametrize("categorical", ["x", "categorical"])
-@pytest.mark.parametrize("missing_values", ["missing_values"])
+@pytest.mark.parametrize("missing_values", ["x"])
 @pytest.mark.parametrize(
     "criterion",
     ["gini", "log_loss", "squared_error", "friedman_mse", "poisson"],

From 96998eef5b0e961a68c32434ac5dd52f038d8179 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Fri, 26 Sep 2025 10:05:52 +0200
Subject: [PATCH 14/23] addressed some copilot comments

---
 sklearn/tree/_classes.py      | 10 ++++++++--
 sklearn/tree/_partitioner.pyx |  2 +-
 sklearn/tree/_tree.pyx        |  2 +-
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py
index 6ef5ac5b7985a..083ee36346d32 100644
--- a/sklearn/tree/_classes.py
+++ b/sklearn/tree/_classes.py
@@ -127,7 +127,6 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta):
         "monotonic_cst": ["array-like", None],
         "categorical_features": [
             "array-like",
-            StrOptions({"from_dtype"}),
             None,
         ],
     }
@@ -563,6 +562,10 @@ def _check_categorical_features(self, X, monotonic_cst):
             f"Values for categorical features should be integers in [0, {MAX_NC - 1}]."
         )
         for idx in np.where(is_categorical)[0]:
+            if np.isnan(X[:, idx]).any():
+                raise ValueError(
+                    "Missing values are not supported in categorical features"
+                )
             if not np.allclose(X[:, idx].astype(np.intp), X[:, idx]):
                 raise ValueError(f"{base_msg} Found non-integer values.")
             if X[:, idx].min() < 0:
@@ -572,7 +575,10 @@ def _check_categorical_features(self, X, monotonic_cst):
                 raise ValueError(f"{base_msg} Found {X_idx_max}.")
             n_categories_in_feature[idx] = X_idx_max + 1
             if monotonic_cst is not None and monotonic_cst[idx] != 0:
-                raise ValueError("Categorical features are incompatible with ")
+                raise ValueError(
+                    "A categorical feature cannot have a non-null monotonic"
+                    " constraint.
" + ) return is_categorical, n_categories_in_feature diff --git a/sklearn/tree/_partitioner.pyx b/sklearn/tree/_partitioner.pyx index d0f51e9341d59..9ef254dea41e8 100644 --- a/sklearn/tree/_partitioner.pyx +++ b/sklearn/tree/_partitioner.pyx @@ -121,7 +121,7 @@ cdef class DensePartitioner: if n_missing == self.end - self.start: return True if self.n_categories <= 0: - # no a categorical feature + # not a categorical feature sort(&feature_values[self.start], &samples[self.start], self.end - self.start - n_missing) if n_missing > 0: return False diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 4a99bed472f40..10ede1924614e 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -960,7 +960,7 @@ cdef class Tree: node.categorical_bitset = cat_split else: node.threshold = threshold - node.categorical_bitset = cat_split + node.categorical_bitset = 0 node.missing_go_to_left = missing_go_to_left self.node_count += 1 From 09b03a0a58aa885618849b9ebc02bb93450861f3 Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 7 Oct 2025 14:45:25 +0200 Subject: [PATCH 15/23] enable more tests --- sklearn/tree/tests/test_split.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/tests/test_split.py b/sklearn/tree/tests/test_split.py index 5bbedb7207903..af1734bffc249 100644 --- a/sklearn/tree/tests/test_split.py +++ b/sklearn/tree/tests/test_split.py @@ -200,9 +200,9 @@ def bitset_to_set(v: np.uint64): return [c for c in range(64) if v & (1 << c)] -@pytest.mark.parametrize("sparse", ["x"]) +@pytest.mark.parametrize("sparse", ["x", "sparse"]) @pytest.mark.parametrize("categorical", ["x", "categorical"]) -@pytest.mark.parametrize("missing_values", ["x"]) +@pytest.mark.parametrize("missing_values", ["x", "missing_values"]) @pytest.mark.parametrize( "criterion", ["gini", "log_loss", "squared_error", "friedman_mse", "poisson"], From 1ded41a956aaed269c4f7b0a1e8734e89a539d84 Mon Sep 17 00:00:00 2001 From: Arthur Date: Sun, 18 Jan 2026 20:17:07 +0100 Subject: [PATCH 16/23] fix uint64 bug --- sklearn/tree/_partitioner.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_partitioner.pyx b/sklearn/tree/_partitioner.pyx index ad8d9a7137115..0f555b0662b49 100644 --- a/sklearn/tree/_partitioner.pyx +++ b/sklearn/tree/_partitioner.pyx @@ -392,7 +392,7 @@ cdef class DensePartitioner: cdef intp_t offset = 0 for r in range(nc): c = self.sorted_cat[r] - bitset |= 1 << c + bitset |= (1) << c offset += self.counts[c] if offset >= p: break @@ -680,7 +680,7 @@ cdef inline bint goes_left( if isnan(value): return missing_go_to_left elif is_categorical: - return split_value.cat_split & (1 << ( value)) + return split_value.cat_split & ((1) << ( value)) else: return value <= split_value.threshold From 7eac9b6d5611858cd360b14a91ff3ed58880a4c0 Mon Sep 17 00:00:00 2001 From: Arthur Date: Sun, 18 Jan 2026 20:18:06 +0100 Subject: [PATCH 17/23] rm --- sklearn/tree/README.md | 76 ------------------------------------------ 1 file changed, 76 deletions(-) delete mode 100644 sklearn/tree/README.md diff --git a/sklearn/tree/README.md b/sklearn/tree/README.md deleted file mode 100644 index 870f0bc191b61..0000000000000 --- a/sklearn/tree/README.md +++ /dev/null @@ -1,76 +0,0 @@ -# sklearn.tree internals - -Quick map of how `sklearn.tree` is wired and how other modules use it. - -## File map -- `_classes.py`: public estimators + `BaseDecisionTree` (validation, builder setup). 
-- `_tree.pyx` / `_tree.pxd`: core `Tree` data structure, builders, prediction, - decision path, pruning helpers; defines `DTYPE` (float32) and `DOUBLE` (float64). -- `_splitter.pyx` / `_splitter.pxd`: split search logic (best/random) using - criteria and partitioners. -- `_partitioner.pyx` / `_partitioner.pxd`: dense/sparse sample partitioning for - a given feature/threshold; handles missing values layout. -- `_criterion.pyx` / `_criterion.pxd`: impurity criteria (Gini/Entropy/MSE/MAE/Poisson), - plus helper structures for efficient updates. -- `_utils.pyx` / `_utils.pxd`: small C helpers, RNG helpers, and `WeightedFenwickTree` - used by criteria. -- `_export.py`: `export_graphviz`, `export_text`, `plot_tree`. -- `_reingold_tilford.py`: layout algorithm used by `plot_tree`. - -## Core objects and data flow -### Fit path -1. `BaseDecisionTree._fit` in `_classes.py` validates input, handles class/target - encoding, computes missing-value mask, and resolves hyperparameters. -2. A `Criterion` instance is created (classification or regression). -3. A `Splitter` instance is created (dense vs sparse, best vs random), optionally - with monotonic constraints. -4. A `Tree` object is allocated to hold node arrays and node values. -5. A `TreeBuilder` builds the tree: - - `DepthFirstTreeBuilder` grows nodes in DFS order (default). - - `BestFirstTreeBuilder` grows by impurity improvement when `max_leaf_nodes` is set. -6. Optional cost-complexity pruning replaces `tree_` with a pruned tree. - -### Predict path -- `Tree.predict` walks each sample to a leaf and returns per-node values. -- Estimator methods (`predict`, `predict_proba`, `apply`, `decision_path`) call `tree_`. - -## Data structures -- `Tree` stores a binary tree as parallel arrays of nodes (see `_tree.pxd`). - Key arrays include `children_left/right`, `feature`, `threshold`, `impurity`, - `n_node_samples`, `weighted_n_node_samples`, `missing_go_to_left`, and `value`. -- Node values are stored as a `(node_count, n_outputs, max_n_classes)` array so - the same structure supports regression and classification. - -## Split search and partitioning -- `Splitter` owns the feature subset, sample indices, and the current `Criterion`. -- `_partitioner` provides dense and sparse implementations for sorting and - partitioning samples for a candidate feature, including missing-value handling. -- `_criterion` computes impurity, impurity improvement, and node values using - incremental updates as the split position moves. - -## Missing values and monotonic constraints -- Missing values are only supported on dense inputs and when tags allow it; - `_compute_missing_values_in_feature_mask` computes a per-feature mask. -- Missing values are grouped to the left or right during split evaluation and - recorded in `missing_go_to_left` so prediction follows the same rule. -- Monotonic constraints are validated in `_classes.py` and passed to splitters. - -## Pruning -- `_build_pruned_tree_ccp` (in `_tree.pyx`) builds a new `Tree` from a subtree - based on `ccp_alpha`. `cost_complexity_pruning_path` in `_classes.py` exposes - the pruning path utility. - -## How other modules use `sklearn.tree` -- `sklearn.ensemble` uses `DecisionTree*` and `ExtraTree*` as base estimators: - - `_forest.py` calls `tree._fit(...)` directly to reuse input validation and - pass bootstrap weights; it also calls `tree.apply` and `tree.decision_path`. - - Feature importances are aggregated via `tree.tree_.compute_feature_importances()`. 
-- Gradient boosting and related ensemble models also build trees via
-  `DecisionTreeRegressor` and rely on the same `Tree`/`Splitter`/`Criterion`
-  machinery.
-
-## Notes for edits
-- Most heavy lifting is in Cython; change signatures in `.pxd` and `.pyx` together.
-- Input arrays are coerced to `DTYPE` (float32) for `X` and `DOUBLE` (float64) for `y`.
-- Missing-value support is conservative; ensure changes keep the mask and
-  `missing_go_to_left` consistent.

From ff856944aa5f29955a2653463d2dd3c41e0874fa Mon Sep 17 00:00:00 2001
From: Arthur
Date: Mon, 26 Jan 2026 21:54:05 +0100
Subject: [PATCH 18/23] fixed test

---
 sklearn/tree/tests/test_split.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/sklearn/tree/tests/test_split.py b/sklearn/tree/tests/test_split.py
index b85a6a1e6ebf8..1b4afa4003fb7 100644
--- a/sklearn/tree/tests/test_split.py
+++ b/sklearn/tree/tests/test_split.py
@@ -35,7 +35,7 @@

 def powerset(iterable):
     """returns all the subsets of `iterable`."""
-    s = list(iterable)  # allows handling sets too
+    s = list(iterable)
     return chain.from_iterable(
         combinations(s, r) for r in range(1, (len(s) + 1) // 2 + 1)
     )
@@ -132,7 +132,7 @@ def _generate_all_splits(self, X):
             nan_mask = np.isnan(x)
             thresholds = np.unique(x[~nan_mask])
             if self.is_categorical[f]:
-                thresholds = list(powerset(thresholds.astype(int)))
+                thresholds = list(powerset(int(th) for th in thresholds))
             for th in thresholds:
                 yield Split(f, th)
             if not nan_mask.any():
@@ -153,11 +153,11 @@ def best_split_naive(self, X, y, w):
         return min(zip(split_impurities, splits), key=itemgetter(0))


-def to_categorical(x, nc):
+def to_categorical(x, nc, rng):
     q = np.linspace(0, 1, num=nc + 1)[1:-1]
     quantiles = np.quantile(x, q)
     cats = np.searchsorted(quantiles, x)
-    return np.random.permutation(nc)[cats]
+    return rng.permutation(nc)[cats]


 def make_simple_dataset(
@@ -176,7 +176,7 @@ def make_simple_dataset(

     for idx in np.where(is_categorical)[0]:
         nc = rng.integers(2, 6)  # can't go too high or the test will be too slow
-        X_dense[:, idx] = to_categorical(X_dense[:, idx], nc)
+        X_dense[:, idx] = to_categorical(X_dense[:, idx], nc, rng)
     with_duplicates = rng.integers(2) == 0
     if with_duplicates:
         X_dense = X_dense.round(1 if n < 50 else 2)
@@ -221,7 +221,7 @@ def test_split_impurity(
     if categorical and "Extra" in Tree.__name__:
         pytest.skip("Categorical features not implemented for the random splitter")

-    rng = np.random.default_rng()
+    rng = np.random.default_rng(global_random_seed)

     ns = [5] * 5 + [10] * 5 + [20, 30, 50, 100]

@@ -234,6 +234,7 @@ def test_split_impurity(
     )
     if categorical:
         is_categorical = rng.random(d) < 0.5
+        n_classes = 2
         tree_kwargs["categorical_features"] = is_categorical
     else:
         is_categorical = np.zeros(d, dtype=bool)

From a5f7376eafd592b3f28791bd1e640a5eb54065c1 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Mon, 26 Jan 2026 22:04:52 +0100
Subject: [PATCH 19/23] fix int size bug

---
 sklearn/tree/_tree.pyx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx
index 0537961fca776..4f220bfd9da2c 100644
--- a/sklearn/tree/_tree.pyx
+++ b/sklearn/tree/_tree.pyx
@@ -1018,7 +1018,7 @@ cdef class Tree:
                         else:
                             node = &self.nodes[node.right_child]
                     elif self.is_categorical[node.feature]:
-                        if node.categorical_bitset & (1 << (<intp_t> X_i_node_feature)):
+                        if node.categorical_bitset & ((<uint64_t>1) << (<intp_t> X_i_node_feature)):
                             node = &self.nodes[node.left_child]
                         else:
                             node = &self.nodes[node.right_child]
@@ -1150,7 +1150,7 @@ cdef class Tree:
                         else:
                             node =
&self.nodes[node.right_child] elif self.is_categorical[node.feature]: - if node.categorical_bitset & (1 << ( X_ndarray[i, node.feature])): + if node.categorical_bitset & ((1) << ( X_ndarray[i, node.feature])): node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] From 67200b3ae6e00a1c4d77d8a0488d2bf8de0ea559 Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 27 Jan 2026 09:49:20 +0100 Subject: [PATCH 20/23] guard for incompatibilities of categorical features --- sklearn/tree/_classes.py | 16 ++++++++++++++++ sklearn/tree/tests/test_tree.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 32e35a1a7a75f..bb6100631fe7e 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -445,6 +445,22 @@ def _fit( self._check_categorical_features(X, monotonic_cst) ) + has_categorical = bool(np.any(self.is_categorical_)) + if has_categorical and self.splitter == "random": + raise ValueError( + "Categorical features are not supported with splitter='random'. " + "Use splitter='best' instead." + ) + if has_categorical and self.n_outputs_ > 1: + raise ValueError( + "Categorical features are not supported with multi-output targets." + ) + if has_categorical and is_classifier(self) and np.any(self.n_classes_ > 2): + raise ValueError( + "Categorical features are only supported for binary classification. " + f"Found {self.n_classes_.max()} classes." + ) + if not isinstance(self.splitter, Splitter): splitter = SPLITTERS[self.splitter]( criterion, diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 174cf6ea2bd11..662736d0cfae9 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -3134,3 +3134,32 @@ def test_random_splitter_missing_values_uses_non_missing_min_max(X, y): assert np.isfinite(threshold) assert non_missing.min() <= threshold <= non_missing.max() + + +def test_categorical_random_splitter_raises(): + X = np.array([[0], [1], [2], [3]], dtype=np.float32) + y = np.array([0, 1, 0, 1]) + + tree = DecisionTreeClassifier( + splitter="random", categorical_features=[0], random_state=0 + ) + with pytest.raises(ValueError, match="splitter='random'"): + tree.fit(X, y) + + +def test_categorical_multiclass_classification_raises(): + X = np.array([[0], [1], [2], [3]], dtype=np.float32) + y = np.array([0, 1, 2, 0]) + + tree = DecisionTreeClassifier(categorical_features=[0], random_state=0) + with pytest.raises(ValueError, match="binary classification"): + tree.fit(X, y) + + +def test_categorical_multioutput_raises(): + X = np.array([[0], [1], [2], [3]], dtype=np.float32) + y = np.array([[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0]]) + + tree = DecisionTreeRegressor(categorical_features=[0], random_state=0) + with pytest.raises(ValueError, match="multi-output"): + tree.fit(X, y) From 09188aab550651959dcd3335e422a9706e2e8071 Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 27 Jan 2026 09:56:59 +0100 Subject: [PATCH 21/23] Update BasePartitioner commented class --- sklearn/tree/_partitioner.pxd | 16 ++++++++++------ sklearn/tree/_partitioner.pyx | 2 -- sklearn/tree/_utils.pxd | 3 +-- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/sklearn/tree/_partitioner.pxd b/sklearn/tree/_partitioner.pxd index 572eeb6ea04af..94314af5285cc 100644 --- a/sklearn/tree/_partitioner.pxd +++ b/sklearn/tree/_partitioner.pxd @@ -32,10 +32,12 @@ cdef const float32_t FEATURE_THRESHOLD = 1e-7 # cdef intp_t end # cdef intp_t n_missing # cdef 
const uint8_t[::1] missing_values_in_feature_mask +# cdef intp_t n_categories -# cdef void sort_samples_and_feature_values( +# cdef bint sort_samples_and_feature_values( # self, intp_t current_feature # ) noexcept nogil +# cdef void shift_missing_to_the_left(self) noexcept nogil # cdef void init_node_split( # self, # intp_t start, @@ -52,15 +54,19 @@ cdef const float32_t FEATURE_THRESHOLD = 1e-7 # intp_t* p_prev, # intp_t* p # ) noexcept nogil +# cdef inline SplitValue pos_to_threshold( +# self, intp_t p_prev, intp_t p +# ) noexcept nogil # cdef intp_t partition_samples( # self, -# float64_t current_threshold +# float64_t current_threshold, +# bint missing_go_to_left # ) noexcept nogil # cdef void partition_samples_final( # self, -# float64_t best_threshold, +# SplitValue split_value, # intp_t best_feature, -# bint best_missing_go_to_left +# bint best_missing_go_to_left, # ) noexcept nogil @@ -119,7 +125,6 @@ cdef class DensePartitioner: ) noexcept nogil cdef void partition_samples_final( self, - intp_t best_pos, SplitValue split_value, intp_t best_feature, bint best_missing_go_to_left, @@ -182,7 +187,6 @@ cdef class SparsePartitioner: ) noexcept nogil cdef void partition_samples_final( self, - intp_t best_pos, SplitValue split_value, intp_t best_feature, bint best_missing_go_to_left, diff --git a/sklearn/tree/_partitioner.pyx b/sklearn/tree/_partitioner.pyx index 0f555b0662b49..0daf9e08c1f1d 100644 --- a/sklearn/tree/_partitioner.pyx +++ b/sklearn/tree/_partitioner.pyx @@ -359,7 +359,6 @@ cdef class DensePartitioner: cdef inline void partition_samples_final( self, - intp_t best_pos, SplitValue split_value, intp_t best_feature, bint best_missing_go_to_left @@ -577,7 +576,6 @@ cdef class SparsePartitioner: cdef inline void partition_samples_final( self, - intp_t best_pos, SplitValue split_value, intp_t best_feature, bint missing_go_to_left diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 32f42f0d59b84..4857edc070202 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -15,8 +15,7 @@ ctypedef union SplitValue: # features. The floating point view, i.e. ``split_value.threshold`` is used # for numerical features, where feature values less than or equal to the # threshold go left, and values greater than the threshold go right. 
- # - # For categorical features, TODO + float64_t threshold uint64_t cat_split # bitset From 88a2d31ae67088190ae6f2049804be024c2c2759 Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 27 Jan 2026 15:09:50 +0100 Subject: [PATCH 22/23] fix --- sklearn/tree/_splitter.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 967e561cbd59c..27b9efaf0bfd8 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -453,7 +453,6 @@ cdef inline int node_split_best( # Reorganize into samples[start:best_split.pos] + samples[best_split.pos:end] if best_split.pos < end: partitioner.partition_samples_final( - best_split.pos, best_split.value, best_split.feature, best_split.missing_go_to_left @@ -689,7 +688,6 @@ cdef inline int node_split_random( if best_split.pos < end: if current_split.feature != best_split.feature: partitioner.partition_samples_final( - best_split.pos, best_split.value, best_split.feature, best_split.missing_go_to_left From e43336c4a095a13829f1a2e33c18f0fd572308f9 Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 27 Jan 2026 22:57:59 +0100 Subject: [PATCH 23/23] docstrings (public and internal) --- sklearn/tree/_classes.py | 37 +++++++++++++++++++++++++++++++---- sklearn/tree/_partitioner.pyx | 26 +++++++++++++++++++----- sklearn/tree/_splitter.pyx | 5 +++-- 3 files changed, 57 insertions(+), 11 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index bb6100631fe7e..0b02e01e3896e 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -535,10 +535,11 @@ def _check_categorical_features(self, X, monotonic_cst): Return ------ - is_categorical : ndarray of shape (n_features,) or None, dtype=bool - Indicates whether a feature is categorical. If no feature is - categorical, this is None. - n_categories_in_feature: TODO + is_categorical : ndarray of shape (n_features,), dtype=bool + Boolean mask indicating whether each feature is categorical. + n_categories_in_feature : ndarray of shape (n_features,), dtype=intp + For categorical features, stores ``max(X[:, idx]) + 1``. For + non-categorical features, stores ``-1``. """ n_features = X.shape[1] categorical_features = np.asarray(self.categorical_features) @@ -977,6 +978,19 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): .. versionadded:: 1.4 + categorical_features : array-like of int or bool of shape (n_features,) or + (n_categorical_features,), default=None + Indicates which features are treated as categorical. + + - If array-like of int, the entries are feature indices. + - If array-like of bool, it is a boolean mask over features. + + Categorical features are only supported for dense inputs + and single-output targets. + Values of categorical features must be contiguous integers in ``[0, 63]`` + (missing values are not supported). + Categorical features cannot have non-zero monotonic constraints. + Attributes ---------- classes_ : ndarray of shape (n_classes,) or list of ndarray @@ -1371,6 +1385,21 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): .. versionadded:: 1.4 + categorical_features : array-like of int or bool of shape (n_features,) or + (n_categorical_features,), default=None + Indicates which features are treated as categorical. + + - If array-like of int, the entries are feature indices. + - If array-like of bool, it is a boolean mask over features. + + Categorical features are only supported for dense inputs + and single-output targets. 
+        Values of categorical features must be contiguous integers in ``[0, 63]``
+        (missing values are not supported).
+        Categorical features cannot have non-zero monotonic constraints.
+
+        When these constraints are not met, ``fit`` will raise an error.
+
     Attributes
     ----------
     feature_importances_ : ndarray of shape (n_features,)
diff --git a/sklearn/tree/_partitioner.pyx b/sklearn/tree/_partitioner.pyx
index 0daf9e08c1f1d..803631e143c08 100644
--- a/sklearn/tree/_partitioner.pyx
+++ b/sklearn/tree/_partitioner.pyx
@@ -80,9 +80,10 @@ cdef class DensePartitioner:
     ) noexcept nogil:
         """Simultaneously sort based on the feature_values.

-        Missing values are stored at the end of feature_values.
-        The number of missing values observed in feature_values is stored
-        in self.n_missing.
+        For numerical features, this is a standard sort. For categorical
+        features, samples are reordered using the Breiman ordering shortcut.
+
+        Returns ``True`` when the feature is constant at the current node.
         """
         cdef:
             intp_t i, current_end
@@ -302,6 +303,12 @@ cdef class DensePartitioner:
     cdef inline SplitValue pos_to_threshold(
         self, intp_t p_prev, intp_t p
    ) noexcept nogil:
+        """Convert a split position into a concrete split value.
+
+        For numerical features, this returns the usual mid-point threshold.
+        For categorical features, it converts the split position into a bitset
+        over categories.
+        """
         cdef SplitValue split
         cdef intp_t end_non_missing = (
             self.end if self.missing_on_the_left
@@ -386,6 +393,7 @@ cdef class DensePartitioner:
                     samples[partition_end], samples[partition_start])

     cdef inline uint64_t _split_pos_to_bitset(self, intp_t p, intp_t nc) noexcept nogil:
+        """Convert a split position ``p`` into a categorical bitset."""
         cdef uint64_t bitset = 0
         cdef intp_t r, c
         cdef intp_t offset = 0
@@ -448,7 +456,10 @@ cdef class SparsePartitioner:
         self, intp_t current_feature
     ) noexcept nogil:
-        """Simultaneously sort based on the feature_values."""
+        """Simultaneously sort based on the feature_values.
+
+        Returns ``True`` when the feature is constant at the current node.
+        """
         cdef:
             float32_t[::1] feature_values = self.feature_values
             intp_t[::1] index_to_samples = self.index_to_samples
@@ -554,7 +565,7 @@ cdef class SparsePartitioner:
     cdef inline SplitValue pos_to_threshold(
         self, intp_t p_prev, intp_t p
     ) noexcept nogil:
-
+        """Convert a split position into a numerical threshold."""
         cdef SplitValue split
         # split between two non-missing values
         # sum of halves is used to avoid infinite value
@@ -675,6 +686,11 @@ cdef class SparsePartitioner:
 cdef inline bint goes_left(
     SplitValue split_value, bint missing_go_to_left, bint is_categorical, float32_t value
 ) noexcept nogil:
+    """Return whether ``value`` should go to the left child.
+
+    This helper centralizes the split semantics for numerical, categorical,
+    and missing values.
+    """
     if isnan(value):
         return missing_go_to_left
     elif is_categorical:
diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx
index 27b9efaf0bfd8..c08ab11992654 100644
--- a/sklearn/tree/_splitter.pyx
+++ b/sklearn/tree/_splitter.pyx
@@ -152,8 +152,9 @@ cdef class Splitter:
             are assumed to have uniform weight. This is represented
             as a Cython memoryview.

-        has_missing : bool
-            At least one missing values is in X.
+        n_categories_in_feature : ndarray, dtype=intp_t
+            Per-feature number of categories for categorical features, and
+            ``-1`` for numerical features.
         """

        self.rand_r_state = self.random_state.randint(0, RAND_R_MAX)
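
Taken together, the series leaves a usable public surface on the branch. A
minimal usage sketch, assuming a build of this branch (released scikit-learn
versions do not expose `categorical_features` on the tree estimators); it
mirrors the `test_categorical` test added earlier in the series, where a single
categorical split separates even from odd category codes:

    import numpy as np
    from sklearn.tree import DecisionTreeClassifier

    rng = np.random.default_rng(0)
    X = rng.random((200, 3))
    X[:, 0] = rng.integers(0, 8, size=200)  # integer category codes, must lie in [0, 63]
    y = X[:, 0].astype(int) % 2             # binary target, as the guards require

    # categorical_features accepts feature indices or a boolean mask over features.
    clf = DecisionTreeClassifier(categorical_features=[0], max_depth=1, random_state=0)
    clf.fit(X, y)
    print(clf.score(X, y))  # should be 1.0: one bitset split isolates the even codes

Note that the same data fitted without `categorical_features` cannot reach a
perfect score at depth 1, since no single threshold separates the two parity
classes; that contrast is exactly what `test_categorical` asserts.
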