From f32d6c0f19239b2c539b55fd0f35ac492615b9af Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 23 Sep 2025 19:57:03 +0200 Subject: [PATCH 01/23] Compiles. Big chunk of the low-level machinery done; still needs to handle bitsets --- sklearn/tree/_classes.py | 10 +++- sklearn/tree/_partitioner.pxd | 10 ++++ sklearn/tree/_partitioner.pyx | 105 ++++++++++++++++++++++++++++++---- sklearn/tree/_splitter.pxd | 1 + sklearn/tree/_splitter.pyx | 19 ++++-- sklearn/tree/_tree.pyx | 6 +- 6 files changed, 132 insertions(+), 19 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index b6ef721c3e974..616ed725ca4e9 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -472,7 +472,15 @@ def _fit( self.min_impurity_decrease, ) - builder.build(self.tree_, X, y, sample_weight, missing_values_in_feature_mask) + n_categories_in_feature = np.zeros(self.n_features_in_, dtype=np.intp) + builder.build( + self.tree_, + X, + y, + sample_weight, + missing_values_in_feature_mask, + n_categories_in_feature, + ) if self.n_outputs_ == 1 and is_classifier(self): self.n_classes_ = self.n_classes_[0] diff --git a/sklearn/tree/_partitioner.pxd b/sklearn/tree/_partitioner.pxd index 63e3bc2bacad0..5f8721df90f9e 100644 --- a/sklearn/tree/_partitioner.pxd +++ b/sklearn/tree/_partitioner.pxd @@ -68,14 +68,23 @@ cdef class DensePartitioner: Note that this partitioner is agnostic to the splitting strategy (best vs. random). """ cdef const float32_t[:, :] X + cdef const float64_t[:, :] y + cdef const float64_t[::1] sample_weight cdef intp_t[::1] samples cdef float32_t[::1] feature_values cdef intp_t start cdef intp_t end cdef intp_t n_missing cdef const uint8_t[::1] missing_values_in_feature_mask + cdef const intp_t[::1] n_categories_in_feature cdef bint missing_on_the_left + cdef intp_t[::1] counts + cdef float64_t[::1] weighted_counts + cdef float64_t[::1] means + cdef intp_t[::1] sorted_cat + cdef intp_t[::1] offsets + cdef void sort_samples_and_feature_values( self, intp_t current_feature ) noexcept nogil @@ -108,6 +117,7 @@ cdef class DensePartitioner: intp_t best_feature, bint best_missing_go_to_left, ) noexcept nogil + cdef void _breiman_sort_categories(self, intp_t nc) noexcept nogil cdef class SparsePartitioner: diff --git a/sklearn/tree/_partitioner.pyx b/sklearn/tree/_partitioner.pyx index a3a2c4940d1ca..5fb286587cd11 100644 --- a/sklearn/tree/_partitioner.pyx +++ b/sklearn/tree/_partitioner.pyx @@ -13,7 +13,7 @@ and sparse data stored in a Compressed Sparse Column (CSC) format. 
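For orientation, the machinery this patch starts building implements the classic Breiman shortcut: for binary classification and squared-error regression, the optimal binary partition of a categorical feature is found by sorting categories by the mean of y and scanning only the nc - 1 prefix cuts of that ordering, instead of all 2^(nc-1) - 1 subsets. A minimal NumPy sketch of the idea (illustrative only; these names are not part of the patch):

    import numpy as np

    def breiman_candidate_splits(x_cat, y, n_categories):
        """Yield the left-child category sets worth evaluating.

        x_cat: integer-coded categories, shape (n_samples,).
        """
        counts = np.bincount(x_cat, minlength=n_categories)
        sums = np.bincount(x_cat, weights=y, minlength=n_categories)
        means = np.divide(sums, np.maximum(counts, 1))  # mean of y per category
        order = np.argsort(means)                       # categories by ascending mean
        for cut in range(1, n_categories):
            yield set(order[:cut].tolist())

The Cython code below does the same thing in-place: it counts and averages per category, sorts category ids by mean, then reorders the samples so a categorical feature can be scanned exactly like a numerical one.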
from cython cimport final
 from libc.math cimport isnan, log2
 from libc.stdlib cimport qsort
-from libc.string cimport memcpy
+from libc.string cimport memcpy, memset
 
 from ._utils cimport swap_array_slices
 
@@ -28,6 +28,8 @@ cdef float32_t EXTRACT_NNZ_SWITCH = 0.1
 # Allow for 32 bit float comparisons
 cdef float32_t INFINITY_32t = np.inf
 
+cdef intp_t MAX_N_CAT = 64
+
 
 @final
 cdef class DensePartitioner:
@@ -38,16 +40,29 @@ cdef class DensePartitioner:
     def __init__(
         self,
         const float32_t[:, :] X,
+        const float64_t[:, :] y,
+        const float64_t[::1] sample_weight,
         intp_t[::1] samples,
         float32_t[::1] feature_values,
         const uint8_t[::1] missing_values_in_feature_mask,
+        const intp_t[::1] n_categories_in_feature,
     ):
         self.X = X
+        self.y = y
+        self.sample_weight = sample_weight
         self.samples = samples
         self.feature_values = feature_values
         self.missing_values_in_feature_mask = missing_values_in_feature_mask
+        self.n_categories_in_feature = n_categories_in_feature
         self.missing_on_the_left = False
 
+        # for the Breiman shortcut:
+        self.counts = np.empty(MAX_N_CAT, dtype=np.intp)
+        self.weighted_counts = np.empty(MAX_N_CAT, dtype=np.float64)
+        self.means = np.empty(MAX_N_CAT, dtype=np.float64)
+        self.sorted_cat = np.empty(MAX_N_CAT, dtype=np.intp)
+        self.offsets = np.empty(MAX_N_CAT, dtype=np.intp)
+
     cdef inline void init_node_split(self, intp_t start, intp_t end) noexcept nogil:
         """Initialize splitter at the beginning of node_split."""
         self.start = start
@@ -98,10 +113,76 @@ cdef class DensePartitioner:
         for i in range(self.start, self.end):
             feature_values[i] = X[samples[i], current_feature]
 
-        sort(&feature_values[self.start], &samples[self.start], self.end - self.start - n_missing)
+        if self.n_categories_in_feature is None or self.n_categories_in_feature[current_feature] <= 0:
+            # not a categorical feature
+            sort(&feature_values[self.start], &samples[self.start], self.end - self.start - n_missing)
+        else:
+            self._breiman_sort_categories(self.n_categories_in_feature[current_feature])
         self.missing_on_the_left = False
         self.n_missing = n_missing
 
+    cdef void _breiman_sort_categories(self, intp_t nc) noexcept nogil:
+        """O(n + nc log nc)"""
+        cdef:
+            intp_t* counts = &self.counts[0]
+            float64_t* weighted_counts = &self.weighted_counts[0]
+            float64_t* means = &self.means[0]
+            intp_t* sorted_cat = &self.sorted_cat[0]
+            intp_t* offsets = &self.offsets[0]
+            float32_t* feature_values = &self.feature_values[0]
+            intp_t* samples = &self.samples[0]
+            intp_t c, r, p, new_p
+            float64_t w = 1.
+
+        memset(means, 0, nc * sizeof(float64_t))
+        memset(counts, 0, nc * sizeof(intp_t))
+        memset(weighted_counts, 0, nc * sizeof(float64_t))
+
+        # compute counts, weighted_counts and means
+        for p in range(self.start, self.end):
+            c = <intp_t> feature_values[p]
+            counts[c] += 1
+            # FIXME: overflow risk when accumulating the y sums?
+            # `sort` now handles float64 keys through the floating_t fused
+            # type below, so the means can stay in float64.
+            means[c] += self.y[samples[p], 0]
+            if self.sample_weight is not None:
+                w = self.sample_weight[samples[p]]
+            self.weighted_counts[c] += w
+
+        for c in range(nc):
+            # TODO: weighted counts
+            if weighted_counts[c] > 0:
+                means[c] /= weighted_counts[c]
+
+        # sorted_cat[r] = the category with the r-th smallest mean (ascending)
+        for c in range(nc):
+            sorted_cat[c] = c
+        sort(means, sorted_cat, nc)
+
+        # build offsets such that:
+        # offsets[c] = sum( counts[x] for all x s.t. 
rank(x) <= rank(c) ) - 1 + cdef intp_t offset = 0 + for r in range(nc): + c = sorted_cat[r] + offset += counts[c] + offsets[c] = offset - 1 + + # sort feature_values & samples such that they are ordered by + # the mean of the category + # while ensuring samples of the same categories are contiguous + p = self.start + while p < self.end: + # XXX: not sure this works + c = feature_values[p] + new_p = offsets[c] + if new_p > p: + swap(feature_values, samples, p, new_p) + # ^ preserves invariant: feature[p] = X[samples[p], f] + offsets[c] -= 1 + else: + p += 1 + cdef void shift_missing_to_the_left(self) noexcept nogil: """ Moves missing values from the right to the left @@ -665,26 +746,30 @@ def _py_sort(float32_t[::1] feature_values, intp_t[::1] samples, intp_t n): sort(&feature_values[0], &samples[0], n) +ctypedef fused floating_t: + float32_t + float64_t + # Sort n-element arrays pointed to by feature_values and samples, simultaneously, # by the values in feature_values. Algorithm: Introsort (Musser, SP&E, 1997). -cdef inline void sort(float32_t* feature_values, intp_t* samples, intp_t n) noexcept nogil: +cdef inline void sort(floating_t* feature_values, intp_t* samples, intp_t n) noexcept nogil: if n == 0: return cdef intp_t maxd = 2 * log2(n) introsort(feature_values, samples, n, maxd) -cdef inline void swap(float32_t* feature_values, intp_t* samples, +cdef inline void swap(floating_t* feature_values, intp_t* samples, intp_t i, intp_t j) noexcept nogil: # Helper for sort feature_values[i], feature_values[j] = feature_values[j], feature_values[i] samples[i], samples[j] = samples[j], samples[i] -cdef inline float32_t median3(float32_t* feature_values, intp_t n) noexcept nogil: +cdef inline floating_t median3(floating_t* feature_values, intp_t n) noexcept nogil: # Median of three pivot selection, after Bentley and McIlroy (1993). # Engineering a sort function. SP&E. Requires 8/3 comparisons on average. - cdef float32_t a = feature_values[0], b = feature_values[n / 2], c = feature_values[n - 1] + cdef floating_t a = feature_values[0], b = feature_values[n / 2], c = feature_values[n - 1] if a < b: if b < c: return b @@ -703,9 +788,9 @@ cdef inline float32_t median3(float32_t* feature_values, intp_t n) noexcept nogi # Introsort with median of 3 pivot selection and 3-way partition function # (robust to repeated elements, e.g. lots of zero features). -cdef void introsort(float32_t* feature_values, intp_t *samples, +cdef void introsort(floating_t* feature_values, intp_t *samples, intp_t n, intp_t maxd) noexcept nogil: - cdef float32_t pivot + cdef floating_t pivot cdef intp_t i, l, r while n > 1: @@ -736,7 +821,7 @@ cdef void introsort(float32_t* feature_values, intp_t *samples, n -= r -cdef inline void sift_down(float32_t* feature_values, intp_t* samples, +cdef inline void sift_down(floating_t* feature_values, intp_t* samples, intp_t start, intp_t end) noexcept nogil: # Restore heap order in feature_values[start:end] by moving the max element to start. 
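    # Usage note on the floating_t fusion above (an illustrative aside, not
    # part of the original patch): the same introsort/heapsort stack now
    # instantiates for float32 keys (raw feature values) and float64 keys
    # (per-category means), e.g.
    #
    #     sort(&feature_values[self.start], &samples[self.start], n)  # float32
    #     sort(means, sorted_cat, nc)                                 # float64
    #
    # which is what lets _breiman_sort_categories order category ids by their
    # float64 means without downcasting.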
cdef intp_t child, maxind, root @@ -759,7 +844,7 @@ cdef inline void sift_down(float32_t* feature_values, intp_t* samples, root = maxind -cdef void heapsort(float32_t* feature_values, intp_t* samples, intp_t n) noexcept nogil: +cdef void heapsort(floating_t* feature_values, intp_t* samples, intp_t n) noexcept nogil: cdef intp_t start, end # heapify diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 899f130b9c291..589760efcf77d 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -84,6 +84,7 @@ cdef class Splitter: const float64_t[:, ::1] y, const float64_t[:] sample_weight, const uint8_t[::1] missing_values_in_feature_mask, + const intp_t[::1] n_categories_in_feature, ) except -1 cdef int node_reset( diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 171bf43bb4ce2..c9206aa0dd5ec 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -129,6 +129,7 @@ cdef class Splitter: const float64_t[:, ::1] y, const float64_t[:] sample_weight, const uint8_t[::1] missing_values_in_feature_mask, + const intp_t[::1] n_categories_in_feature, ) except -1: """Initialize the splitter. @@ -755,10 +756,12 @@ cdef class BestSplitter(Splitter): const float64_t[:, ::1] y, const float64_t[:] sample_weight, const uint8_t[::1] missing_values_in_feature_mask, + const intp_t[::1] n_categories_in_feature, ) except -1: - Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) + Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask, n_categories_in_feature) self.partitioner = DensePartitioner( - X, self.samples, self.feature_values, missing_values_in_feature_mask + X, y, sample_weight, self.samples, self.feature_values, + missing_values_in_feature_mask, n_categories_in_feature ) cdef int node_split( @@ -783,8 +786,9 @@ cdef class BestSparseSplitter(Splitter): const float64_t[:, ::1] y, const float64_t[:] sample_weight, const uint8_t[::1] missing_values_in_feature_mask, + const intp_t[::1] n_categories_in_feature, ) except -1: - Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) + Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask, n_categories_in_feature) self.partitioner = SparsePartitioner( X, self.samples, self.n_samples, self.feature_values, missing_values_in_feature_mask ) @@ -811,10 +815,12 @@ cdef class RandomSplitter(Splitter): const float64_t[:, ::1] y, const float64_t[:] sample_weight, const uint8_t[::1] missing_values_in_feature_mask, + const intp_t[::1] n_categories_in_feature, ) except -1: - Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) + Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask, n_categories_in_feature) self.partitioner = DensePartitioner( - X, self.samples, self.feature_values, missing_values_in_feature_mask + X, y, sample_weight, self.samples, self.feature_values, + missing_values_in_feature_mask, n_categories_in_feature ) cdef int node_split( @@ -839,8 +845,9 @@ cdef class RandomSparseSplitter(Splitter): const float64_t[:, ::1] y, const float64_t[:] sample_weight, const uint8_t[::1] missing_values_in_feature_mask, + const intp_t[::1] n_categories_in_feature, ) except -1: - Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask) + Splitter.init(self, X, y, sample_weight, missing_values_in_feature_mask, n_categories_in_feature) self.partitioner = SparsePartitioner( X, self.samples, self.n_samples, self.feature_values, missing_values_in_feature_mask ) diff --git 
a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 9d0b2854c3ba0..83203ea142594 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -145,6 +145,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): const float64_t[:, ::1] y, const float64_t[:] sample_weight=None, const uint8_t[::1] missing_values_in_feature_mask=None, + const intp_t [::1] n_categories_in_feature=None, ): """Build a decision tree from the training set (X, y).""" @@ -170,7 +171,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): cdef float64_t min_impurity_decrease = self.min_impurity_decrease # Recursive partition (without actual recursion) - splitter.init(X, y, sample_weight, missing_values_in_feature_mask) + splitter.init(X, y, sample_weight, missing_values_in_feature_mask, n_categories_in_feature) cdef intp_t start cdef intp_t end @@ -400,6 +401,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): const float64_t[:, ::1] y, const float64_t[:] sample_weight=None, const uint8_t[::1] missing_values_in_feature_mask=None, + const intp_t[::1] n_categories_in_feature=None, ): """Build a decision tree from the training set (X, y).""" @@ -411,7 +413,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): cdef intp_t max_leaf_nodes = self.max_leaf_nodes # Recursive partition (without actual recursion) - splitter.init(X, y, sample_weight, missing_values_in_feature_mask) + splitter.init(X, y, sample_weight, missing_values_in_feature_mask, n_categories_in_feature) cdef vector[FrontierRecord] frontier cdef FrontierRecord record From 35c10acab283b50bc54b10f5ac23eb078ea81710 Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 23 Sep 2025 20:34:56 +0200 Subject: [PATCH 02/23] Added bitset creation logic; TODO: using it. --- sklearn/tree/_partitioner.pyx | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/sklearn/tree/_partitioner.pyx b/sklearn/tree/_partitioner.pyx index 5fb286587cd11..c7ec18f0c2d9d 100644 --- a/sklearn/tree/_partitioner.pyx +++ b/sklearn/tree/_partitioner.pyx @@ -28,7 +28,7 @@ cdef float32_t EXTRACT_NNZ_SWITCH = 0.1 # Allow for 32 bit float comparisons cdef float32_t INFINITY_32t = np.inf -cdef intp_t MAX_N_CAT = 64 +cdef intp_t MAX_N_CAT = 32 @final @@ -183,6 +183,18 @@ cdef class DensePartitioner: else: p += 1 + cdef uint32_t _split_pos_to_bitset(self, intp_t p, intp_t nc): + cdef uint32_t bitset = 0 + cdef intp_t r, c + cdef intp_t offset = 0 + for r in range(nc): + c = self.sorted_cat[r] + bitset |= 1 << c + offset += self.counts[c] + if offset >= p: + break + return bitset + cdef void shift_missing_to_the_left(self) noexcept nogil: """ Moves missing values from the right to the left From 61cdba743f45d8b7f0eb4a17fffe051e6ad74af5 Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 23 Sep 2025 22:25:43 +0200 Subject: [PATCH 03/23] added goes_left function --- sklearn/tree/_partitioner.pxd | 3 ++- sklearn/tree/_partitioner.pyx | 19 +++++++++++++++---- sklearn/tree/_utils.pxd | 15 ++++++++++++++- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/sklearn/tree/_partitioner.pxd b/sklearn/tree/_partitioner.pxd index 5f8721df90f9e..d98f0077c501a 100644 --- a/sklearn/tree/_partitioner.pxd +++ b/sklearn/tree/_partitioner.pxd @@ -4,7 +4,7 @@ # See _partitioner.pyx for details. 
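The 64-bit bitset encoding built by _split_pos_to_bitset in the previous patch, and consumed by the goes_left helper added in this one, is the core trick: bit c of the mask set means "category c goes to the left child" (hence the MAX_N_CAT cap). A small pure-Python sketch of the encoding and membership test, with hypothetical helper names:

    def make_cat_split(left_categories):
        # encode "these categories go left" as a 64-bit mask
        bitset = 0
        for c in left_categories:
            assert 0 <= c < 64
            bitset |= 1 << c
        return bitset

    def category_goes_left(bitset, category):
        return bool(bitset & (1 << category))

    split = make_cat_split({0, 3})         # == 0b1001
    assert category_goes_left(split, 3)
    assert not category_goes_left(split, 2)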
from ..utils._typedefs cimport (
-    float32_t, float64_t, int8_t, int32_t, intp_t, uint8_t, uint32_t
+    float32_t, float64_t, int8_t, int32_t, intp_t, uint8_t, uint32_t, uint64_t
 )
 from ._splitter cimport SplitRecord
 
@@ -118,6 +118,7 @@ cdef class DensePartitioner:
         bint best_missing_go_to_left,
     ) noexcept nogil
     cdef void _breiman_sort_categories(self, intp_t nc) noexcept nogil
+    cdef inline uint64_t _split_pos_to_bitset(self, intp_t p, intp_t nc) noexcept nogil
 
 
 cdef class SparsePartitioner:
diff --git a/sklearn/tree/_partitioner.pyx b/sklearn/tree/_partitioner.pyx
index c7ec18f0c2d9d..0a093e1dab085 100644
--- a/sklearn/tree/_partitioner.pyx
+++ b/sklearn/tree/_partitioner.pyx
@@ -15,7 +15,7 @@ from cython cimport final
 from libc.math cimport isnan, log2
 from libc.stdlib cimport qsort
 from libc.string cimport memcpy, memset
 
-from ._utils cimport swap_array_slices
+from ._utils cimport SplitValue, swap_array_slices
 
 import numpy as np
 from scipy.sparse import issparse
@@ -28,7 +28,7 @@ cdef float32_t EXTRACT_NNZ_SWITCH = 0.1
 # Allow for 32 bit float comparisons
 cdef float32_t INFINITY_32t = np.inf
 
-cdef intp_t MAX_N_CAT = 32
+cdef intp_t MAX_N_CAT = 64
 
 
 @final
@@ -183,8 +183,8 @@ cdef class DensePartitioner:
             else:
                 p += 1
 
-    cdef uint32_t _split_pos_to_bitset(self, intp_t p, intp_t nc):
-        cdef uint32_t bitset = 0
+    cdef inline uint64_t _split_pos_to_bitset(self, intp_t p, intp_t nc) noexcept nogil:
+        cdef uint64_t bitset = 0
         cdef intp_t r, c
         cdef intp_t offset = 0
         for r in range(nc):
@@ -601,6 +601,17 @@ cdef class SparsePartitioner:
                              &self.end_negative, &self.start_positive)
 
 
+cdef inline bint goes_left(
+    SplitValue split_value, bint missing_go_to_left, bint is_categorical, float32_t value
+) noexcept nogil:
+    if isnan(value):
+        return missing_go_to_left
+    elif is_categorical:
+        return split_value.cat_split & (1 << (<intp_t> value))
+    else:
+        return value <= split_value.threshold
+
+
 cdef int compare_SIZE_t(const void* a, const void* b) noexcept nogil:
     """Comparison function for sort.
 
diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd
index 67b438d42fdbc..1a588388c773a 100644
--- a/sklearn/tree/_utils.pxd
+++ b/sklearn/tree/_utils.pxd
@@ -6,7 +6,9 @@ cimport numpy as cnp
 from ._tree cimport Node
 from ..neighbors._quad_tree cimport Cell
-from ..utils._typedefs cimport float32_t, float64_t, intp_t, uint8_t, int32_t, uint32_t
+from ..utils._typedefs cimport (
+    float32_t, float64_t, intp_t, uint8_t, int32_t, uint32_t, uint64_t
+)
 
 
 cdef enum:
@@ -56,6 +58,17 @@ cdef int swap_array_slices(
     void* array, intp_t start, intp_t end, intp_t n, size_t itemsize
 ) except -1 nogil
 
+
+ctypedef union SplitValue:
+    # Union type to generalize the concept of a threshold to categorical
+    # features. The floating point view, i.e. ``split_value.threshold`` is used
+    # for numerical features, where feature values less than or equal to the
+    # threshold go left, and values greater than the threshold go right.
+ # + # For categorical features, TODO + float64_t threshold + uint64_t cat_split # bitset + # ============================================================================= # WeightedPQueue data structure # ============================================================================= From e8cbf55677fb228e469197e6f5799c783c170594 Mon Sep 17 00:00:00 2001 From: Arthur Date: Wed, 24 Sep 2025 12:01:37 +0200 Subject: [PATCH 04/23] Most changes down; all tests (non categorical) pass; still remains to trigger the categorical split path and test it --- sklearn/ensemble/_gradient_boosting.pyx | 3 +- sklearn/tree/_partitioner.pxd | 13 +++- sklearn/tree/_partitioner.pyx | 90 ++++++++++++++++++------- sklearn/tree/_splitter.pxd | 3 +- sklearn/tree/_splitter.pyx | 29 ++------ sklearn/tree/_tree.pxd | 24 +++---- sklearn/tree/_tree.pyx | 33 +++++++-- sklearn/tree/_utils.pxd | 37 ++++++---- 8 files changed, 147 insertions(+), 85 deletions(-) diff --git a/sklearn/ensemble/_gradient_boosting.pyx b/sklearn/ensemble/_gradient_boosting.pyx index cd9845a217c7d..86b65e1f36c8a 100644 --- a/sklearn/ensemble/_gradient_boosting.pyx +++ b/sklearn/ensemble/_gradient_boosting.pyx @@ -10,9 +10,8 @@ from scipy.sparse import issparse from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint8_t # Note: _tree uses cimport numpy, cnp.import_array, so we need to include # numpy headers in the build configuration of this extension -from ..tree._tree cimport Node from ..tree._tree cimport Tree -from ..tree._utils cimport safe_realloc +from ..tree._utils cimport Node, safe_realloc # no namespace lookup for numpy dtype and array creation diff --git a/sklearn/tree/_partitioner.pxd b/sklearn/tree/_partitioner.pxd index d98f0077c501a..12fd019f1b8aa 100644 --- a/sklearn/tree/_partitioner.pxd +++ b/sklearn/tree/_partitioner.pxd @@ -7,6 +7,7 @@ from ..utils._typedefs cimport ( float32_t, float64_t, int8_t, int32_t, intp_t, uint8_t, uint32_t, uint64_t ) from ._splitter cimport SplitRecord +from ._utils cimport SplitValue # Mitigate precision differences between 32 bit and 64 bit @@ -78,6 +79,7 @@ cdef class DensePartitioner: cdef const uint8_t[::1] missing_values_in_feature_mask cdef const intp_t[::1] n_categories_in_feature cdef bint missing_on_the_left + cdef intp_t n_categories cdef intp_t[::1] counts cdef float64_t[::1] weighted_counts @@ -105,6 +107,9 @@ cdef class DensePartitioner: intp_t* p_prev, intp_t* p ) noexcept nogil + cdef inline SplitValue pos_to_threshold( + self, intp_t p_prev, intp_t p + ) noexcept nogil cdef intp_t partition_samples( self, float64_t current_threshold, @@ -113,7 +118,7 @@ cdef class DensePartitioner: cdef void partition_samples_final( self, intp_t best_pos, - float64_t best_threshold, + SplitValue split_value, intp_t best_feature, bint best_missing_go_to_left, ) noexcept nogil @@ -143,6 +148,7 @@ cdef class SparsePartitioner: cdef intp_t end cdef intp_t n_missing cdef const uint8_t[::1] missing_values_in_feature_mask + cdef intp_t n_categories cdef void sort_samples_and_feature_values( self, intp_t current_feature @@ -164,6 +170,9 @@ cdef class SparsePartitioner: intp_t* p_prev, intp_t* p ) noexcept nogil + cdef inline SplitValue pos_to_threshold( + self, intp_t p_prev, intp_t p + ) noexcept nogil cdef intp_t partition_samples( self, float64_t current_threshold, @@ -172,7 +181,7 @@ cdef class SparsePartitioner: cdef void partition_samples_final( self, intp_t best_pos, - float64_t best_threshold, + SplitValue split_value, intp_t best_feature, bint best_missing_go_to_left, ) noexcept 
nogil diff --git a/sklearn/tree/_partitioner.pyx b/sklearn/tree/_partitioner.pyx index 20b851fc0844b..bf6924953d12e 100644 --- a/sklearn/tree/_partitioner.pyx +++ b/sklearn/tree/_partitioner.pyx @@ -15,7 +15,7 @@ from libc.math cimport isnan, log2 from libc.stdlib cimport qsort from libc.string cimport memcpy, memset -from ._utils cimport SplitValue, swap_array_slices +from ._utils cimport swap_array_slices import numpy as np from scipy.sparse import issparse @@ -25,6 +25,7 @@ from scipy.sparse import issparse # in SparsePartitioner cdef float32_t EXTRACT_NNZ_SWITCH = 0.1 +cdef float64_t INFINITY = np.inf # Allow for 32 bit float comparisons cdef float32_t INFINITY_32t = np.inf @@ -55,6 +56,7 @@ cdef class DensePartitioner: self.missing_values_in_feature_mask = missing_values_in_feature_mask self.n_categories_in_feature = n_categories_in_feature self.missing_on_the_left = False + self.n_categories = 0 # for breiman shortcut: self.counts = np.empty(MAX_N_CAT, dtype=np.intp) @@ -113,11 +115,12 @@ cdef class DensePartitioner: for i in range(self.start, self.end): feature_values[i] = X[samples[i], current_feature] - if self.n_categories_in_feature is None or self.n_categories_in_feature[current_feature] <= 0: + self.n_categories = self.n_categories_in_feature[current_feature] + if self.n_categories <= 0: # no a categorical feature sort(&feature_values[self.start], &samples[self.start], self.end - self.start - n_missing) else: - self._breiman_sort_categories(self.n_categories_in_feature[current_feature]) + self._breiman_sort_categories(self.n_categories) self.missing_on_the_left = False self.n_missing = n_missing @@ -183,18 +186,6 @@ cdef class DensePartitioner: else: p += 1 - cdef inline uint64_t _split_pos_to_bitset(self, intp_t p, intp_t nc) noexcept nogil: - cdef uint64_t bitset = 0 - cdef intp_t r, c - cdef intp_t offset = 0 - for r in range(nc): - c = self.sorted_cat[r] - bitset |= 1 << c - offset += self.counts[c] - if offset >= p: - break - return bitset - cdef void shift_missing_to_the_left(self) noexcept nogil: """ Moves missing values from the right to the left @@ -276,6 +267,33 @@ cdef class DensePartitioner: p[0] += 1 p_prev[0] = p[0] - 1 + cdef inline SplitValue pos_to_threshold( + self, intp_t p_prev, intp_t p + ) noexcept nogil: + cdef intp_t end_non_missing = ( + self.end if self.missing_on_the_left + else self.end - self.n_missing) + + cdef SplitValue split + if self.n_categories > 0: + split.cat_split = self._split_pos_to_bitset(p, self.n_categories) + return split + + if p == end_non_missing and not self.missing_on_the_left: + # split with the right node being only the missing values + split.threshold = INFINITY + return split + + # split between two non-missing values + # sum of halves is used to avoid infinite value + split.threshold = ( + self.feature_values[p_prev] / 2.0 + self.feature_values[p] / 2.0 + ) + if split.threshold == INFINITY or split.threshold == -INFINITY: + split.threshold = self.feature_values[p_prev] + + return split + cdef inline intp_t partition_samples( self, float64_t current_threshold, @@ -305,7 +323,7 @@ cdef class DensePartitioner: cdef inline void partition_samples_final( self, intp_t best_pos, - float64_t best_threshold, + SplitValue split_value, intp_t best_feature, bint best_missing_go_to_left ) noexcept nogil: @@ -320,20 +338,28 @@ cdef class DensePartitioner: intp_t partition_end = self.end intp_t* samples = &self.samples[0] float32_t current_value - bint go_to_left + bint is_cat = self.n_categories_in_feature[best_feature] > 0 while p < 
partition_end: current_value = self.X[samples[p], best_feature] - go_to_left = ( - best_missing_go_to_left if isnan(current_value) - else current_value <= best_threshold - ) - if go_to_left: + if goes_left(split_value, best_missing_go_to_left, is_cat, current_value): p += 1 else: partition_end -= 1 samples[p], samples[partition_end] = samples[partition_end], samples[p] + cdef inline uint64_t _split_pos_to_bitset(self, intp_t p, intp_t nc) noexcept nogil: + cdef uint64_t bitset = 0 + cdef intp_t r, c + cdef intp_t offset = 0 + for r in range(nc): + c = self.sorted_cat[r] + bitset |= 1 << c + offset += self.counts[c] + if offset >= p: + break + return bitset + @final cdef class SparsePartitioner: @@ -372,6 +398,7 @@ cdef class SparsePartitioner: self.index_to_samples[samples[p]] = p self.missing_values_in_feature_mask = missing_values_in_feature_mask + self.n_categories = 0 cdef inline void init_node_split(self, intp_t start, intp_t end) noexcept nogil: """Initialize splitter at the beginning of node_split.""" @@ -485,6 +512,21 @@ cdef class SparsePartitioner: p_prev[0] = p[0] p[0] = p_next + cdef inline SplitValue pos_to_threshold( + self, intp_t p_prev, intp_t p + ) noexcept nogil: + + cdef SplitValue split + # split between two non-missing values + # sum of halves is used to avoid infinite value + split.threshold = ( + self.feature_values[p_prev] / 2.0 + self.feature_values[p] / 2.0 + ) + if split.threshold == INFINITY or split.threshold == -INFINITY: + split.threshold = self.feature_values[p_prev] + + return split + cdef inline intp_t partition_samples( self, float64_t current_threshold, @@ -496,13 +538,13 @@ cdef class SparsePartitioner: cdef inline void partition_samples_final( self, intp_t best_pos, - float64_t best_threshold, + SplitValue split_value, intp_t best_feature, bint missing_go_to_left ) noexcept nogil: """Partition samples for X at the best_threshold and best_feature.""" self.extract_nnz(best_feature) - self._partition(best_threshold, best_pos) + self._partition(split_value.threshold, best_pos) cdef inline intp_t _partition(self, float64_t threshold, intp_t zero_pos) noexcept nogil: """Partition samples[start:end] based on threshold.""" diff --git a/sklearn/tree/_splitter.pxd b/sklearn/tree/_splitter.pxd index 589760efcf77d..cfd13a3b69600 100644 --- a/sklearn/tree/_splitter.pxd +++ b/sklearn/tree/_splitter.pxd @@ -8,6 +8,7 @@ from ..utils._typedefs cimport ( ) from ._criterion cimport Criterion from ._tree cimport ParentInfo +from ._utils cimport SplitValue cdef struct SplitRecord: @@ -16,7 +17,7 @@ cdef struct SplitRecord: intp_t pos # Split samples array at the given position, # # i.e. count of samples below threshold for feature. # # pos is >= end if the node is a leaf. - float64_t threshold # Threshold to split at. + SplitValue value # Threshold/Bitset to split at. float64_t improvement # Impurity improvement given parent node. float64_t impurity_left # Impurity of the left split. float64_t impurity_right # Impurity of the right split. diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index c9206aa0dd5ec..80721d8f7e7c0 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -49,7 +49,6 @@ cdef inline void _init_split(SplitRecord* self, intp_t start_pos) noexcept nogil self.impurity_right = INFINITY self.pos = start_pos self.feature = 0 - self.threshold = 0. 
self.improvement = -INFINITY self.missing_go_to_left = False self.n_missing = 0 @@ -446,21 +445,7 @@ cdef inline int node_split_best( if current_proxy_improvement > best_proxy_improvement: best_proxy_improvement = current_proxy_improvement - if p == end_non_missing and not missing_go_to_left: - # split with the right node being only the missing values - current_split.threshold = INFINITY - else: - # split between two non-missing values - # sum of halves is used to avoid infinite value - current_split.threshold = ( - feature_values[p_prev] / 2.0 + feature_values[p] / 2.0 - ) - if ( - current_split.threshold == INFINITY or - current_split.threshold == -INFINITY - ): - current_split.threshold = feature_values[p_prev] - + current_split.value = partitioner.pos_to_threshold(p_prev, p) current_split.n_missing = n_missing # if there are no missing values in the training data, during @@ -477,7 +462,7 @@ cdef inline int node_split_best( if best_split.pos < end: partitioner.partition_samples_final( best_split.pos, - best_split.threshold, + best_split.value, best_split.feature, best_split.missing_go_to_left ) @@ -636,7 +621,7 @@ cdef inline int node_split_random( has_missing = n_missing != 0 # Draw a random threshold - current_split.threshold = rand_uniform( + current_split.value.threshold = rand_uniform( min_feature_value, max_feature_value, random_state, @@ -656,12 +641,12 @@ cdef inline int node_split_random( else: missing_go_to_left = 0 - if current_split.threshold == max_feature_value: - current_split.threshold = min_feature_value + if current_split.value.threshold == max_feature_value: + current_split.value.threshold = min_feature_value # Partition current_split.pos = partitioner.partition_samples( - current_split.threshold, missing_go_to_left + current_split.value.threshold, missing_go_to_left ) n_left = current_split.pos - start @@ -715,7 +700,7 @@ cdef inline int node_split_random( if current_split.feature != best_split.feature: partitioner.partition_samples_final( best_split.pos, - best_split.threshold, + best_split.value, best_split.feature, best_split.missing_go_to_left ) diff --git a/sklearn/tree/_tree.pxd b/sklearn/tree/_tree.pxd index 2cadca4564a87..e37c0493735b7 100644 --- a/sklearn/tree/_tree.pxd +++ b/sklearn/tree/_tree.pxd @@ -6,22 +6,13 @@ import numpy as np cimport numpy as cnp -from ..utils._typedefs cimport float32_t, float64_t, intp_t, int32_t, uint8_t, uint32_t +from ..utils._typedefs cimport ( + float32_t, float64_t, intp_t, int32_t, uint8_t, uint32_t, uint64_t +) from ._splitter cimport Splitter from ._splitter cimport SplitRecord - -cdef struct Node: - # Base storage structure for the nodes in a Tree object - - intp_t left_child # id of the left child of the node - intp_t right_child # id of the right child of the node - intp_t feature # Feature used for splitting the node - float64_t threshold # Threshold value at the node - float64_t impurity # Impurity of the node (i.e., the value of the criterion) - intp_t n_node_samples # Number of samples at the node - float64_t weighted_n_node_samples # Weighted number of samples at the node - uint8_t missing_go_to_left # Whether features have missing values +from ._utils cimport Node, SplitValue cdef struct ParentInfo: @@ -44,6 +35,9 @@ cdef class Tree: cdef public intp_t n_outputs # Number of outputs in y cdef public intp_t max_n_classes # max(n_classes) + # FIXME: change to uint8_t: but the error it triggers might be a Cython bug + cdef intp_t* is_categorical # Shape (n_features,) + # Inner structures: values are stored separately 
from node structure, # since size is determined at runtime. cdef public intp_t max_depth # Max depth of the tree @@ -55,8 +49,8 @@ cdef class Tree: # Methods cdef intp_t _add_node(self, intp_t parent, bint is_left, bint is_leaf, - intp_t feature, float64_t threshold, float64_t impurity, - intp_t n_node_samples, + intp_t feature, float64_t threshold, uint64_t cat_split, + float64_t impurity, intp_t n_node_samples, float64_t weighted_n_node_samples, uint8_t missing_go_to_left) except -1 nogil cdef int _resize(self, intp_t capacity) except -1 nogil diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 83203ea142594..73642cd2e639c 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -255,7 +255,8 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): min_impurity_decrease)) node_id = tree._add_node(parent, is_left, is_leaf, split.feature, - split.threshold, parent_record.impurity, + split.value.threshold, split.value.cat_split, + parent_record.impurity, n_node_samples, weighted_n_node_samples, split.missing_go_to_left) @@ -469,6 +470,7 @@ cdef class BestFirstTreeBuilder(TreeBuilder): node.right_child = _TREE_LEAF node.feature = _TREE_UNDEFINED node.threshold = _TREE_UNDEFINED + # node.categorical_bitset = _TREE_UNDEFINED else: # Node is expandable @@ -613,8 +615,9 @@ cdef class BestFirstTreeBuilder(TreeBuilder): node_id = tree._add_node(parent - tree.nodes if parent != NULL else _TREE_UNDEFINED, - is_left, is_leaf, - split.feature, split.threshold, parent_record.impurity, + is_left, is_leaf, split.feature, + split.value.threshold, split.value.cat_split, + parent_record.impurity, n_node_samples, weighted_n_node_samples, split.missing_go_to_left) if node_id == INTPTR_MAX: @@ -790,6 +793,11 @@ cdef class Tree: for k in range(n_outputs): self.n_classes[k] = n_classes[k] + self.is_categorical = NULL + safe_realloc(&self.is_categorical, n_features) + for f in range(n_features): + self.is_categorical[f] = False + # Inner structures self.max_depth = 0 self.node_count = 0 @@ -800,6 +808,7 @@ cdef class Tree: def __dealloc__(self): """Destructor.""" # Free all inner structures + free(self.is_categorical) free(self.n_classes) free(self.value) free(self.nodes) @@ -897,8 +906,8 @@ cdef class Tree: return 0 cdef intp_t _add_node(self, intp_t parent, bint is_left, bint is_leaf, - intp_t feature, float64_t threshold, float64_t impurity, - intp_t n_node_samples, + intp_t feature, float64_t threshold, uint64_t cat_split, + float64_t impurity, intp_t n_node_samples, float64_t weighted_n_node_samples, uint8_t missing_go_to_left) except -1 nogil: """Add a node to the tree. 
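With Node now carrying both a float64 threshold and a uint64 categorical_bitset, the traversal rule that later patches wire into apply() can be summarized in schematic Python — a sketch only, assuming the Node field names above:

    import math

    def child_of(node, x_value, feature_is_categorical):
        if math.isnan(x_value):
            return node.left_child if node.missing_go_to_left else node.right_child
        if feature_is_categorical:
            go_left = node.categorical_bitset >> int(x_value) & 1
        else:
            go_left = x_value <= node.threshold
        return node.left_child if go_left else node.right_child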
@@ -929,11 +938,17 @@ cdef class Tree: node.right_child = _TREE_LEAF node.feature = _TREE_UNDEFINED node.threshold = _TREE_UNDEFINED + # node.categorical_bitset = _TREE_UNDEFINED else: # left_child and right_child will be set later node.feature = feature - node.threshold = threshold + if self.is_categorical[feature]: + node.threshold = -INFINITY + node.categorical_bitset = cat_split + else: + node.threshold = threshold + node.categorical_bitset = cat_split node.missing_go_to_left = missing_go_to_left self.node_count += 1 @@ -990,6 +1005,7 @@ cdef class Tree: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] + # TODO: handle categorical case: elif X_i_node_feature <= node.threshold: node = &self.nodes[node.left_child] else: @@ -1111,6 +1127,7 @@ cdef class Tree: indices[indptr[i + 1]] = (node - self.nodes) indptr[i + 1] += 1 + # TODO handle categorical if X_ndarray[i, node.feature] <= node.threshold: node = &self.nodes[node.left_child] else: @@ -1391,6 +1408,7 @@ cdef class Tree: if is_target_feature: # In this case, we push left or right child on stack + # TODO: handle categotical (and missing?) if X[sample_idx, feature_idx] <= current_node.threshold: node_idx_stack[stack_size] = current_node.left_child else: @@ -1931,7 +1949,8 @@ cdef void _build_pruned_tree( break new_node_id = tree._add_node( - parent, is_left, is_leaf, node.feature, node.threshold, + parent, is_left, is_leaf, node.feature, + node.threshold, node.categorical_bitset, node.impurity, node.n_node_samples, node.weighted_n_node_samples, node.missing_go_to_left) diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 1a588388c773a..524a60fed7cac 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -4,13 +4,37 @@ # See _utils.pyx for details. cimport numpy as cnp -from ._tree cimport Node from ..neighbors._quad_tree cimport Cell from ..utils._typedefs cimport ( float32_t, float64_t, intp_t, uint8_t, int32_t, uint32_t, uint64_t ) +ctypedef union SplitValue: + # Union type to generalize the concept of a threshold to categorical + # features. The floating point view, i.e. ``split_value.threshold`` is used + # for numerical features, where feature values less than or equal to the + # threshold go left, and values greater than the threshold go right. + # + # For categorical features, TODO + float64_t threshold + uint64_t cat_split # bitset + + +cdef struct Node: + # Base storage structure for the nodes in a Tree object + + intp_t left_child # id of the left child of the node + intp_t right_child # id of the right child of the node + intp_t feature # Feature used for splitting the node + float64_t threshold # Threshold value at the node, for continuous split (-INF otherwise) + uint64_t categorical_bitset # Bitset for categorical split (0 otherwise) + float64_t impurity # Impurity of the node (i.e., the value of the criterion) + intp_t n_node_samples # Number of samples at the node + float64_t weighted_n_node_samples # Weighted number of samples at the node + uint8_t missing_go_to_left # Whether features have missing values + + cdef enum: # Max value for our rand_r replacement (near the bottom). # We don't use RAND_MAX because it's different across platforms and @@ -58,17 +82,6 @@ cdef int swap_array_slices( void* array, intp_t start, intp_t end, intp_t n, size_t itemsize ) except -1 nogil - -ctypedef union SplitValue: - # Union type to generalize the concept of a threshold to categorical - # features. The floating point view, i.e. 
``split_value.threshold`` is used - # for numerical features, where feature values less than or equal to the - # threshold go left, and values greater than the threshold go right. - # - # For categorical features, TODO - float64_t threshold - uint64_t cat_split # bitset - # ============================================================================= # WeightedPQueue data structure # ============================================================================= From 4c571af93229f23e4ffd0c4675e95e89161598ae Mon Sep 17 00:00:00 2001 From: Arthur Date: Wed, 24 Sep 2025 17:20:47 +0200 Subject: [PATCH 05/23] added categorical_features kwarg to top-level classes; non-categorical tests pass; TODO: test with categories --- sklearn/tree/_classes.py | 159 +++++++++++++++++++++++++++++++- sklearn/tree/_partitioner.pyx | 1 + sklearn/tree/_tree.pyx | 19 ++-- sklearn/tree/tests/test_tree.py | 18 +++- 4 files changed, 182 insertions(+), 15 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 616ed725ca4e9..f3147fef18cc1 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -47,6 +47,7 @@ _assert_all_finite_element_wise, _check_n_features, _check_sample_weight, + _is_pandas_df, assert_all_finite, check_is_fitted, validate_data, @@ -125,6 +126,11 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): "min_impurity_decrease": [Interval(Real, 0.0, None, closed="left")], "ccp_alpha": [Interval(Real, 0.0, None, closed="left")], "monotonic_cst": ["array-like", None], + "categorical_features": [ + "array-like", + StrOptions({"from_dtype"}), + None, + ], } @abstractmethod @@ -144,6 +150,7 @@ def __init__( class_weight=None, ccp_alpha=0.0, monotonic_cst=None, + categorical_features=None, ): self.criterion = criterion self.splitter = splitter @@ -158,6 +165,7 @@ def __init__( self.class_weight = class_weight self.ccp_alpha = ccp_alpha self.monotonic_cst = monotonic_cst + self.categorical_features = categorical_features def get_depth(self): """Return the depth of the decision tree. @@ -241,6 +249,8 @@ def _fit( ): random_state = check_random_state(self.random_state) + self.is_categorical_ = self._check_categorical_features(X) + if check_input: # Need to validate separately here. 
# We can't pass multi_output=True because that would allow y to be @@ -259,13 +269,18 @@ def _fit( missing_values_in_feature_mask = ( self._compute_missing_values_in_feature_mask(X) ) - if issparse(X): + is_sparse_X = issparse(X) + if is_sparse_X: X.sort_indices() if X.indices.dtype != np.intc or X.indptr.dtype != np.intc: raise ValueError( "No support for np.int64 index based sparse matrices" ) + if is_sparse_X and self.categorical_features is not None: + raise NotImplementedError( + "Categorical features not supported with sparse inputs" + ) if self.criterion == "poisson": if np.any(y < 0): @@ -442,13 +457,19 @@ def _fit( ) if is_classifier(self): - self.tree_ = Tree(self.n_features_in_, self.n_classes_, self.n_outputs_) + self.tree_ = Tree( + self.n_features_in_, + self.n_classes_, + self.n_outputs_, + self.is_categorical_, + ) else: self.tree_ = Tree( self.n_features_in_, # TODO: tree shouldn't need this in this case np.array([1] * self.n_outputs_, dtype=np.intp), self.n_outputs_, + self.is_categorical_, ) # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise @@ -472,7 +493,14 @@ def _fit( self.min_impurity_decrease, ) - n_categories_in_feature = np.zeros(self.n_features_in_, dtype=np.intp) + n_categories_in_feature = np.full(self.n_features_in_, -1, dtype=np.intp) + if self.is_categorical_ is not None: + for idx in np.where(self.is_categorical_)[0]: + assert np.allclose(X[:, idx].astype(np.intp), X[:, idx]) + assert X[:, idx].min() >= 0 + n_categories_in_feature[idx] = X[:, idx].max() + 1 + assert (n_categories_in_feature <= 64).all() + builder.build( self.tree_, X, @@ -490,6 +518,126 @@ def _fit( return self + def _check_categorical_features(self, X): + """Copied from gradient_boosting.py on sept. 2025 + Check and validate categorical features in X + + Parameters + ---------- + X : {array-like, pandas DataFrame} of shape (n_samples, n_features) + Input data. + + Return + ------ + is_categorical : ndarray of shape (n_features,) or None, dtype=bool + Indicates whether a feature is categorical. If no feature is + categorical, this is None. + """ + # Special code for pandas because of a bug in recent pandas, which is + # fixed in main and maybe included in 2.2.1, see + # https://github.com/pandas-dev/pandas/pull/57173. + # Also pandas versions < 1.5.1 do not support the dataframe interchange + if _is_pandas_df(X): + X_is_dataframe = True + categorical_columns_mask = np.asarray(X.dtypes == "category") + elif hasattr(X, "__dataframe__"): + X_is_dataframe = True + categorical_columns_mask = np.asarray( + [ + c.dtype[0].name == "CATEGORICAL" + for c in X.__dataframe__().get_columns() + ] + ) + else: + X_is_dataframe = False + categorical_columns_mask = None + + categorical_features = self.categorical_features + + categorical_by_dtype = ( + isinstance(categorical_features, str) + and categorical_features == "from_dtype" + ) + no_categorical_dtype = categorical_features is None or ( + categorical_by_dtype and not X_is_dataframe + ) + + if no_categorical_dtype: + return None + + use_pandas_categorical = categorical_by_dtype and X_is_dataframe + if use_pandas_categorical: + categorical_features = categorical_columns_mask + else: + categorical_features = np.asarray(categorical_features) + + if categorical_features.size == 0: + return None + + if categorical_features.dtype.kind not in ("i", "b", "U", "O"): + raise ValueError( + "categorical_features must be an array-like of bool, int or " + f"str, got: {categorical_features.dtype.name}." 
+ ) + + if categorical_features.dtype.kind == "O": + types = set(type(f) for f in categorical_features) + if types != {str}: + raise ValueError( + "categorical_features must be an array-like of bool, int or " + f"str, got: {', '.join(sorted(t.__name__ for t in types))}." + ) + + n_features = X.shape[1] + # At this point `validate_data` was not called yet because we use the original + # dtypes to discover the categorical features. Thus `feature_names_in_` + # is not defined yet. + feature_names_in_ = getattr(X, "columns", None) + + if categorical_features.dtype.kind in ("U", "O"): + # check for feature names + if feature_names_in_ is None: + raise ValueError( + "categorical_features should be passed as an array of " + "integers or as a boolean mask when the model is fitted " + "on data without feature names." + ) + is_categorical = np.zeros(n_features, dtype=bool) + feature_names = list(feature_names_in_) + for feature_name in categorical_features: + try: + is_categorical[feature_names.index(feature_name)] = True + except ValueError as e: + raise ValueError( + f"categorical_features has a item value '{feature_name}' " + "which is not a valid feature name of the training " + f"data. Observed feature names: {feature_names}" + ) from e + elif categorical_features.dtype.kind == "i": + # check for categorical features as indices + if ( + np.max(categorical_features) >= n_features + or np.min(categorical_features) < 0 + ): + raise ValueError( + "categorical_features set as integer " + "indices must be in [0, n_features - 1]" + ) + is_categorical = np.zeros(n_features, dtype=bool) + is_categorical[categorical_features] = True + else: + if categorical_features.shape[0] != n_features: + raise ValueError( + "categorical_features set as a boolean mask " + "must have shape (n_features,), got: " + f"{categorical_features.shape}" + ) + is_categorical = categorical_features + + if not np.any(is_categorical): + return None + return is_categorical + def _validate_X_predict(self, X, check_input): """Validate the training data on predict (probabilities).""" if check_input: @@ -628,13 +776,16 @@ def _prune_tree(self): # build pruned tree if is_classifier(self): n_classes = np.atleast_1d(self.n_classes_) - pruned_tree = Tree(self.n_features_in_, n_classes, self.n_outputs_) + pruned_tree = Tree( + self.n_features_in_, n_classes, self.n_outputs_, self.is_categorical_ + ) else: pruned_tree = Tree( self.n_features_in_, # TODO: the tree shouldn't need this param np.array([1] * self.n_outputs_, dtype=np.intp), self.n_outputs_, + self.is_categorical_, ) _build_pruned_tree_ccp(pruned_tree, self.tree_, self.ccp_alpha) diff --git a/sklearn/tree/_partitioner.pyx b/sklearn/tree/_partitioner.pyx index bf6924953d12e..d0a4a66021761 100644 --- a/sklearn/tree/_partitioner.pyx +++ b/sklearn/tree/_partitioner.pyx @@ -740,6 +740,7 @@ cdef inline void extract_nnz_binary_search(const int32_t[::1] X_indices, n_samples * log(n_indices)). 
""" cdef intp_t n_samples + # printf("Is sorted: %d\n", is_samples_sorted[0]) if not is_samples_sorted[0]: n_samples = end - start diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 73642cd2e639c..27ef8e6416119 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -773,7 +773,7 @@ cdef class Tree: # TODO: Convert n_classes to cython.integral memory view once # https://github.com/cython/cython/issues/5243 is fixed - def __cinit__(self, intp_t n_features, cnp.ndarray n_classes, intp_t n_outputs): + def __cinit__(self, intp_t n_features, cnp.ndarray n_classes, intp_t n_outputs, cnp.ndarray is_categorical): """Constructor.""" cdef intp_t dummy = 0 size_t_dtype = np.array(dummy).dtype @@ -795,8 +795,12 @@ cdef class Tree: self.is_categorical = NULL safe_realloc(&self.is_categorical, n_features) - for f in range(n_features): - self.is_categorical[f] = False + if is_categorical is None: + for f in range(n_features): + self.is_categorical[f] = False + else: + for f in range(n_features): + self.is_categorical[f] = is_categorical[f] # Inner structures self.max_depth = 0 @@ -815,9 +819,12 @@ cdef class Tree: def __reduce__(self): """Reduce re-implementation, for pickling.""" - return (Tree, (self.n_features, - sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), - self.n_outputs), self.__getstate__()) + return (Tree, ( + self.n_features, + sizet_ptr_to_ndarray(self.n_classes, self.n_outputs), + self.n_outputs, + sizet_ptr_to_ndarray(self.is_categorical, self.n_features) + ), self.__getstate__()) def __getstate__(self): """Getstate re-implementation, for pickling.""" diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 8ad5f507a2fab..9db97ec529e24 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -2141,13 +2141,15 @@ def get_different_alignment_node_ndarray(node_ndarray): def reduce_tree_with_different_bitness(tree): new_dtype = np.int64 if _IS_32BIT else np.int32 - tree_cls, (n_features, n_classes, n_outputs), state = tree.__reduce__() + tree_cls, (n_features, n_classes, n_outputs, is_categorical), state = ( + tree.__reduce__() + ) new_n_classes = n_classes.astype(new_dtype, casting="same_kind") new_state = state.copy() new_state["nodes"] = get_different_bitness_node_ndarray(new_state["nodes"]) - return (tree_cls, (n_features, new_n_classes, n_outputs), new_state) + return (tree_cls, (n_features, new_n_classes, n_outputs, is_categorical), new_state) def test_different_bitness_pickle(): @@ -2788,7 +2790,9 @@ def test_build_pruned_tree_py(): tree.fit(iris.data, iris.target) n_classes = np.atleast_1d(tree.n_classes_) - pruned_tree = CythonTree(tree.n_features_in_, n_classes, tree.n_outputs_) + pruned_tree = CythonTree( + tree.n_features_in_, n_classes, tree.n_outputs_, tree.is_categorical_ + ) # only keep the root note leave_in_subtree = np.zeros(tree.tree_.node_count, dtype=np.uint8) @@ -2802,7 +2806,9 @@ def test_build_pruned_tree_py(): assert_array_equal(tree.tree_.value[0], pruned_tree.value[0]) # now keep all the leaves - pruned_tree = CythonTree(tree.n_features_in_, n_classes, tree.n_outputs_) + pruned_tree = CythonTree( + tree.n_features_in_, n_classes, tree.n_outputs_, tree.is_categorical_ + ) leave_in_subtree = np.zeros(tree.tree_.node_count, dtype=np.uint8) leave_in_subtree[1:] = 1 @@ -2820,7 +2826,9 @@ def test_build_pruned_tree_infinite_loop(): tree = DecisionTreeClassifier(random_state=0, max_depth=1) tree.fit(iris.data, iris.target) n_classes = np.atleast_1d(tree.n_classes_) - pruned_tree = 
CythonTree(tree.n_features_in_, n_classes, tree.n_outputs_) + pruned_tree = CythonTree( + tree.n_features_in_, n_classes, tree.n_outputs_, tree.is_categorical_ + ) # only keeping one child as a leaf results in an improper tree leave_in_subtree = np.zeros(tree.tree_.node_count, dtype=np.uint8) From a27b16c8ce37879ecd695efa874de31af95fee78 Mon Sep 17 00:00:00 2001 From: Arthur Date: Wed, 24 Sep 2025 19:11:46 +0200 Subject: [PATCH 06/23] use categorical split in apply/predict; basic test test with categories work in notebook: logic seems to be correct --- sklearn/tree/_classes.py | 4 ++++ sklearn/tree/_tree.pyx | 19 ++++++++++++++----- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index f3147fef18cc1..3c300718cb3f1 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -1135,6 +1135,7 @@ def __init__( class_weight=None, ccp_alpha=0.0, monotonic_cst=None, + categorical_features=None, ): super().__init__( criterion=criterion, @@ -1150,6 +1151,7 @@ def __init__( min_impurity_decrease=min_impurity_decrease, monotonic_cst=monotonic_cst, ccp_alpha=ccp_alpha, + categorical_features=categorical_features, ) @_fit_context(prefer_skip_nested_validation=True) @@ -1512,6 +1514,7 @@ def __init__( min_impurity_decrease=0.0, ccp_alpha=0.0, monotonic_cst=None, + categorical_features=None, ): super().__init__( criterion=criterion, @@ -1526,6 +1529,7 @@ def __init__( min_impurity_decrease=min_impurity_decrease, ccp_alpha=ccp_alpha, monotonic_cst=monotonic_cst, + categorical_features=categorical_features, ) @_fit_context(prefer_skip_nested_validation=True) diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 27ef8e6416119..4db6bbd28fae3 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -799,6 +799,7 @@ cdef class Tree: for f in range(n_features): self.is_categorical[f] = False else: + is_categorical = is_categorical.astype(np.intp) for f in range(n_features): self.is_categorical[f] = is_categorical[f] @@ -1005,14 +1006,18 @@ cdef class Tree: node = self.nodes # While node not a leaf while node.left_child != _TREE_LEAF: - X_i_node_feature = X_ndarray[i, node.feature] # ... and node.right_child != _TREE_LEAF: + X_i_node_feature = X_ndarray[i, node.feature] if isnan(X_i_node_feature): if node.missing_go_to_left: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] - # TODO: handle categorical case: + if self.is_categorical[node.feature]: + if node.categorical_bitset & (1 << ( X_ndarray[i, node.feature])): + node = &self.nodes[node.left_child] + else: + node = &self.nodes[node.right_child] elif X_i_node_feature <= node.threshold: node = &self.nodes[node.left_child] else: @@ -1134,8 +1139,12 @@ cdef class Tree: indices[indptr[i + 1]] = (node - self.nodes) indptr[i + 1] += 1 - # TODO handle categorical - if X_ndarray[i, node.feature] <= node.threshold: + if self.is_categorical[node.feature]: + if node.categorical_bitset & (1 << ( X_ndarray[i, node.feature])): + node = &self.nodes[node.left_child] + else: + node = &self.nodes[node.right_child] + elif X_ndarray[i, node.feature] <= node.threshold: node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] @@ -1415,7 +1424,7 @@ cdef class Tree: if is_target_feature: # In this case, we push left or right child on stack - # TODO: handle categotical (and missing?) + # TODO: handle categorical (and missing?) 
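+                # A possible shape for that branch, mirroring apply() above
+                # (a sketch only, not wired in yet):
+                #
+                #     if self.is_categorical[feature_idx]:
+                #         if current_node.categorical_bitset & (1 << (<intp_t> X[sample_idx, feature_idx])):
+                #             node_idx_stack[stack_size] = current_node.left_child
+                #         else:
+                #             node_idx_stack[stack_size] = current_node.right_child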
if X[sample_idx, feature_idx] <= current_node.threshold: node_idx_stack[stack_size] = current_node.left_child else: From 288c1eaf14ee3fac91e85dceb1a727a4ceea9adc Mon Sep 17 00:00:00 2001 From: Arthur Date: Wed, 24 Sep 2025 22:22:08 +0200 Subject: [PATCH 07/23] refacto check_cat; better error messages --- sklearn/tree/_classes.py | 130 ++++++++++----------------------------- 1 file changed, 34 insertions(+), 96 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 3c300718cb3f1..6ef5ac5b7985a 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -47,7 +47,6 @@ _assert_all_finite_element_wise, _check_n_features, _check_sample_weight, - _is_pandas_df, assert_all_finite, check_is_fitted, validate_data, @@ -249,8 +248,6 @@ def _fit( ): random_state = check_random_state(self.random_state) - self.is_categorical_ = self._check_categorical_features(X) - if check_input: # Need to validate separately here. # We can't pass multi_output=True because that would allow y to be @@ -446,6 +443,10 @@ def _fit( # *positive class*, all signs must be flipped. monotonic_cst *= -1 + self.is_categorical_, n_categories_in_feature = ( + self._check_categorical_features(X, monotonic_cst) + ) + if not isinstance(self.splitter, Splitter): splitter = SPLITTERS[self.splitter]( criterion, @@ -493,14 +494,6 @@ def _fit( self.min_impurity_decrease, ) - n_categories_in_feature = np.full(self.n_features_in_, -1, dtype=np.intp) - if self.is_categorical_ is not None: - for idx in np.where(self.is_categorical_)[0]: - assert np.allclose(X[:, idx].astype(np.intp), X[:, idx]) - assert X[:, idx].min() >= 0 - n_categories_in_feature[idx] = X[:, idx].max() + 1 - assert (n_categories_in_feature <= 64).all() - builder.build( self.tree_, X, @@ -518,101 +511,31 @@ def _fit( return self - def _check_categorical_features(self, X): - """Copied from gradient_boosting.py on sept. 2025 - Check and validate categorical features in X + def _check_categorical_features(self, X, monotonic_cst): + """Check and validate categorical features in X Parameters ---------- - X : {array-like, pandas DataFrame} of shape (n_samples, n_features) - Input data. + X : {array-like} of shape (n_samples, n_features) + Input data (after `validate_data` was called) Return ------ is_categorical : ndarray of shape (n_features,) or None, dtype=bool Indicates whether a feature is categorical. If no feature is categorical, this is None. + n_categories_in_feature: TODO """ - # Special code for pandas because of a bug in recent pandas, which is - # fixed in main and maybe included in 2.2.1, see - # https://github.com/pandas-dev/pandas/pull/57173. 
- # Also pandas versions < 1.5.1 do not support the dataframe interchange - if _is_pandas_df(X): - X_is_dataframe = True - categorical_columns_mask = np.asarray(X.dtypes == "category") - elif hasattr(X, "__dataframe__"): - X_is_dataframe = True - categorical_columns_mask = np.asarray( - [ - c.dtype[0].name == "CATEGORICAL" - for c in X.__dataframe__().get_columns() - ] - ) - else: - X_is_dataframe = False - categorical_columns_mask = None - - categorical_features = self.categorical_features - - categorical_by_dtype = ( - isinstance(categorical_features, str) - and categorical_features == "from_dtype" - ) - no_categorical_dtype = categorical_features is None or ( - categorical_by_dtype and not X_is_dataframe - ) - - if no_categorical_dtype: - return None - - use_pandas_categorical = categorical_by_dtype and X_is_dataframe - if use_pandas_categorical: - categorical_features = categorical_columns_mask - else: - categorical_features = np.asarray(categorical_features) - - if categorical_features.size == 0: - return None + n_features = X.shape[1] + categorical_features = np.asarray(self.categorical_features) - if categorical_features.dtype.kind not in ("i", "b", "U", "O"): + if self.categorical_features is None or categorical_features.size == 0: + is_categorical = np.zeros(n_features, dtype=bool) + elif categorical_features.dtype.kind not in ("i", "b"): raise ValueError( - "categorical_features must be an array-like of bool, int or " - f"str, got: {categorical_features.dtype.name}." + "categorical_features must be an array-like of bool or int, " + f"got: {categorical_features.dtype.name}." ) - - if categorical_features.dtype.kind == "O": - types = set(type(f) for f in categorical_features) - if types != {str}: - raise ValueError( - "categorical_features must be an array-like of bool, int or " - f"str, got: {', '.join(sorted(t.__name__ for t in types))}." - ) - - n_features = X.shape[1] - # At this point `validate_data` was not called yet because we use the original - # dtypes to discover the categorical features. Thus `feature_names_in_` - # is not defined yet. - feature_names_in_ = getattr(X, "columns", None) - - if categorical_features.dtype.kind in ("U", "O"): - # check for feature names - if feature_names_in_ is None: - raise ValueError( - "categorical_features should be passed as an array of " - "integers or as a boolean mask when the model is fitted " - "on data without feature names." - ) - is_categorical = np.zeros(n_features, dtype=bool) - feature_names = list(feature_names_in_) - for feature_name in categorical_features: - try: - is_categorical[feature_names.index(feature_name)] = True - except ValueError as e: - raise ValueError( - f"categorical_features has a item value '{feature_name}' " - "which is not a valid feature name of the training " - f"data. Observed feature names: {feature_names}" - ) from e elif categorical_features.dtype.kind == "i": # check for categorical features as indices if ( @@ -634,9 +557,24 @@ def _check_categorical_features(self, X): ) is_categorical = categorical_features - if not np.any(is_categorical): - return None - return is_categorical + n_categories_in_feature = np.full(self.n_features_in_, -1, dtype=np.intp) + MAX_NC = 64 # TODO import from somewhere + base_msg = ( + f"Values for categorical features should be integers in [0, {MAX_NC - 1}]." 
+        )
+        for idx in np.where(is_categorical)[0]:
+            if not np.allclose(X[:, idx].astype(np.intp), X[:, idx]):
+                raise ValueError(f"{base_msg} Found non-integer values.")
+            if X[:, idx].min() < 0:
+                raise ValueError(f"{base_msg} Found negative values.")
+            X_idx_max = X[:, idx].max()
+            if X_idx_max >= MAX_NC:
+                raise ValueError(f"{base_msg} Found {X_idx_max}.")
+            n_categories_in_feature[idx] = X_idx_max + 1
+            if monotonic_cst is not None and monotonic_cst[idx] != 0:
+                raise ValueError("Categorical features are incompatible with ")
+
+        return is_categorical, n_categories_in_feature

     def _validate_X_predict(self, X, check_input):
         """Validate the training data on predict (probabilities)."""

From f512e0323ad7f08f6f801d367b893a119211c3b5 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Wed, 24 Sep 2025 22:46:21 +0200
Subject: [PATCH 08/23] fix segfault

---
 sklearn/tree/_tree.pyx | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx
index 4db6bbd28fae3..ed71e159b1792 100644
--- a/sklearn/tree/_tree.pyx
+++ b/sklearn/tree/_tree.pyx
@@ -1013,8 +1013,8 @@ cdef class Tree:
                             node = &self.nodes[node.left_child]
                         else:
                             node = &self.nodes[node.right_child]
-                    if self.is_categorical[node.feature]:
-                        if node.categorical_bitset & (1 << (<intp_t> X_ndarray[i, node.feature])):
+                    elif self.is_categorical[node.feature]:
+                        if node.categorical_bitset & (1 << (<intp_t> X_i_node_feature)):
                             node = &self.nodes[node.left_child]
                         else:
                             node = &self.nodes[node.right_child]
@@ -1139,6 +1139,7 @@ cdef class Tree:
                     indices[indptr[i + 1]] = <intp_t>(node - self.nodes)
                     indptr[i + 1] += 1

+                    # TODO: handle missing values?
                     if self.is_categorical[node.feature]:
                         if node.categorical_bitset & (1 << (<intp_t> X_ndarray[i, node.feature])):
                             node = &self.nodes[node.left_child]
                         else:
                             node = &self.nodes[node.right_child]

From eb7f94ca4cd029150bdff722f9ac977fb617dbcc Mon Sep 17 00:00:00 2001
From: Arthur
Date: Thu, 25 Sep 2025 10:29:42 +0200
Subject: [PATCH 09/23] added basic but strong tests; fixed some minor but
 impactful bugs revealed by the newly added tests

---
 sklearn/tree/_partitioner.pxd   |  4 ++--
 sklearn/tree/_partitioner.pyx   | 25 +++++++++++++++++++------
 sklearn/tree/_splitter.pyx      | 13 +++----------
 sklearn/tree/tests/test_tree.py | 31 +++++++++++++++++++++++++++++--
 4 files changed, 53 insertions(+), 20 deletions(-)

diff --git a/sklearn/tree/_partitioner.pxd b/sklearn/tree/_partitioner.pxd
index 12fd019f1b8aa..a157d259cae29 100644
--- a/sklearn/tree/_partitioner.pxd
+++ b/sklearn/tree/_partitioner.pxd
@@ -87,7 +87,7 @@ cdef class DensePartitioner:
     cdef intp_t[::1] sorted_cat
     cdef intp_t[::1] offsets

-    cdef void sort_samples_and_feature_values(
+    cdef bint sort_samples_and_feature_values(
         self, intp_t current_feature
     ) noexcept nogil
     cdef void shift_missing_to_the_left(self) noexcept nogil
@@ -150,7 +150,7 @@ cdef class SparsePartitioner:
     cdef const uint8_t[::1] missing_values_in_feature_mask
     cdef intp_t n_categories

-    cdef void sort_samples_and_feature_values(
+    cdef bint sort_samples_and_feature_values(
        self, intp_t current_feature
     ) noexcept nogil
     cdef void shift_missing_to_the_left(self) noexcept nogil
diff --git a/sklearn/tree/_partitioner.pyx b/sklearn/tree/_partitioner.pyx
index d0a4a66021761..7feee5b8f082e 100644
--- a/sklearn/tree/_partitioner.pyx
+++ b/sklearn/tree/_partitioner.pyx
@@ -71,7 +71,7 @@ cdef class DensePartitioner:
         self.end = end
         self.n_missing = 0

-    cdef inline void sort_samples_and_feature_values(
+    cdef inline bint sort_samples_and_feature_values(
         self, intp_t current_feature
     ) noexcept nogil:
         """Simultaneously sort based on the
feature_values. @@ -115,14 +115,18 @@ cdef class DensePartitioner: for i in range(self.start, self.end): feature_values[i] = X[samples[i], current_feature] + self.missing_on_the_left = False + self.n_missing = n_missing self.n_categories = self.n_categories_in_feature[current_feature] + if n_missing == self.end - self.start: + return True if self.n_categories <= 0: # no a categorical feature sort(&feature_values[self.start], &samples[self.start], self.end - self.start - n_missing) + return feature_values[self.end - n_missing - 1] <= feature_values[self.start] + FEATURE_THRESHOLD else: self._breiman_sort_categories(self.n_categories) - self.missing_on_the_left = False - self.n_missing = n_missing + return feature_values[self.start] == feature_values[self.end - 1] cdef void _breiman_sort_categories(self, intp_t nc) noexcept nogil: """ O(n + nc log nc)""" @@ -249,12 +253,20 @@ cdef class DensePartitioner: cdef intp_t end_non_missing = ( self.end if self.missing_on_the_left else self.end - self.n_missing) + cdef float32_t c if p[0] == end_non_missing and not self.missing_on_the_left: # skip the missing values up to the end # (which will end the for loop in the best split function) p[0] = self.end p_prev[0] = self.end + elif self.n_categories > 0: + c = self.feature_values[p[0]] + p[0] += 1 + while p[0] < end_non_missing and self.feature_values[p[0]] == c: + p[0] += 1 + + # p_prev is unused in this case else: if self.missing_on_the_left and p[0] == self.start: # skip the missing values up to the first non-missing value: @@ -270,11 +282,11 @@ cdef class DensePartitioner: cdef inline SplitValue pos_to_threshold( self, intp_t p_prev, intp_t p ) noexcept nogil: + cdef SplitValue split cdef intp_t end_non_missing = ( self.end if self.missing_on_the_left else self.end - self.n_missing) - cdef SplitValue split if self.n_categories > 0: split.cat_split = self._split_pos_to_bitset(p, self.n_categories) return split @@ -407,7 +419,7 @@ cdef class SparsePartitioner: self.is_samples_sorted = 0 self.n_missing = 0 - cdef inline void sort_samples_and_feature_values( + cdef inline bint sort_samples_and_feature_values( self, intp_t current_feature ) noexcept nogil: @@ -446,6 +458,8 @@ cdef class SparsePartitioner: # number of missing values for current_feature self.n_missing = 0 + return feature_values[self.end - 1] <= feature_values[self.start] + FEATURE_THRESHOLD + cdef void shift_missing_to_the_left(self) noexcept nogil: pass # missing values not support for sparse @@ -740,7 +754,6 @@ cdef inline void extract_nnz_binary_search(const int32_t[::1] X_indices, n_samples * log(n_indices)). 
""" cdef intp_t n_samples - # printf("Is sorted: %d\n", is_samples_sorted[0]) if not is_samples_sorted[0]: n_samples = end - start diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 80721d8f7e7c0..86f8fcfb2d2a1 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -281,7 +281,6 @@ cdef inline int node_split_best( # Find the best split cdef intp_t start = splitter.start cdef intp_t end = splitter.end - cdef intp_t end_non_missing cdef intp_t n_missing = 0 cdef bint has_missing = 0 cdef intp_t n_searches @@ -292,7 +291,6 @@ cdef inline int node_split_best( cdef intp_t[::1] constant_features = splitter.constant_features cdef intp_t n_features = splitter.n_features - cdef float32_t[::1] feature_values = splitter.feature_values cdef intp_t max_features = splitter.max_features cdef intp_t min_samples_leaf = splitter.min_samples_leaf cdef float64_t min_weight_leaf = splitter.min_weight_leaf @@ -308,6 +306,7 @@ cdef inline int node_split_best( cdef intp_t f_i = n_features cdef intp_t f_j + cdef bint is_constant cdef intp_t p cdef intp_t p_prev @@ -367,16 +366,10 @@ cdef inline int node_split_best( f_j += n_found_constants # f_j in the interval [n_total_constants, f_i[ current_split.feature = features[f_j] - partitioner.sort_samples_and_feature_values(current_split.feature) + is_constant = partitioner.sort_samples_and_feature_values(current_split.feature) n_missing = partitioner.n_missing - end_non_missing = end - n_missing - if ( - # All values for this feature are missing, or - end_non_missing == start or - # This feature is considered constant (max - min <= FEATURE_THRESHOLD) - feature_values[end_non_missing - 1] <= feature_values[start] + FEATURE_THRESHOLD - ): + if is_constant: # We consider this feature constant in this case. # Since finding a split among constant feature is not valuable, # we do not consider this feature for splitting. 
diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 9db97ec529e24..339c85b08a291 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -2053,10 +2053,18 @@ def test_criterion_entropy_same_as_log_loss(Tree, n_classes): assert_allclose(tree_log_loss.predict(X), tree_entropy.predict(X)) +def to_categorical(x, nc): + q = np.linspace(0, 1, num=nc + 1)[1:-1] + quantiles = np.quantile(x, q) + cats = np.searchsorted(quantiles, x) + return np.random.permutation(nc)[cats] + + def test_different_endianness_pickle(): X, y = datasets.make_classification(random_state=0) + X[:, 0] = to_categorical(X[:, 0], 5) - clf = DecisionTreeClassifier(random_state=0, max_depth=3) + clf = DecisionTreeClassifier(random_state=0, max_depth=3, categorical_features=[0]) clf.fit(X, y) score = clf.score(X, y) @@ -2080,8 +2088,9 @@ def get_pickle_non_native_endianness(): def test_different_endianness_joblib_pickle(): X, y = datasets.make_classification(random_state=0) + X[:, 0] = to_categorical(X[:, 0], 5) - clf = DecisionTreeClassifier(random_state=0, max_depth=3) + clf = DecisionTreeClassifier(random_state=0, max_depth=3, categorical_features=[0]) clf.fit(X, y) score = clf.score(X, y) @@ -2859,3 +2868,21 @@ def test_sort_log2_build(): ] # fmt: on assert_array_equal(samples, expected_samples) + + +@pytest.mark.parametrize("Tree", [DecisionTreeClassifier, DecisionTreeRegressor]) +def test_categorical(Tree): + n = 40 + c = np.random.randint(0, 20, size=n) + y = c % 2 + + X = np.random.rand(n, 3) + X[:, 0] = c + + tree = Tree(categorical_features=[0], max_depth=1) + # assert perfect tree was reached in one split + assert tree.fit(X, y).score(X, y) == 1 + + # assert it's not the case without using categorical_features + tree = Tree(max_depth=1) + assert tree.fit(X, y).score(X, y) < 1 From 9afc1d8e40c73d8e240a8d3b1654744277f14c6a Mon Sep 17 00:00:00 2001 From: Arthur Date: Thu, 25 Sep 2025 13:17:19 +0200 Subject: [PATCH 10/23] Minor tests changes --- sklearn/tree/tests/test_tree.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 339c85b08a291..ca72173adb527 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -2061,11 +2061,13 @@ def to_categorical(x, nc): def test_different_endianness_pickle(): - X, y = datasets.make_classification(random_state=0) - X[:, 0] = to_categorical(X[:, 0], 5) + X, y = datasets.make_classification(random_state=0, n_redundant=0, shuffle=False) + X[:, 0] = to_categorical(X[:, 0], 50) clf = DecisionTreeClassifier(random_state=0, max_depth=3, categorical_features=[0]) clf.fit(X, y) + assert 0 < clf.feature_importances_[0] < 1 + # ^ ensures some splits are categorical, some are continuous score = clf.score(X, y) def reduce_ndarray(arr): @@ -2088,10 +2090,12 @@ def get_pickle_non_native_endianness(): def test_different_endianness_joblib_pickle(): X, y = datasets.make_classification(random_state=0) - X[:, 0] = to_categorical(X[:, 0], 5) + X[:, 0] = to_categorical(X[:, 0], 50) clf = DecisionTreeClassifier(random_state=0, max_depth=3, categorical_features=[0]) clf.fit(X, y) + assert 0 < clf.feature_importances_[0] < 1 + # ^ ensures some splits are categorical, some are continuous score = clf.score(X, y) class NonNativeEndiannessNumpyPickler(NumpyPickler): @@ -2872,16 +2876,18 @@ def test_sort_log2_build(): @pytest.mark.parametrize("Tree", [DecisionTreeClassifier, DecisionTreeRegressor]) def 
test_categorical(Tree): + rng = np.random.default_rng(3) n = 40 - c = np.random.randint(0, 20, size=n) + c = rng.integers(0, 20, size=n) y = c % 2 - X = np.random.rand(n, 3) + X = rng.random((n, 3)) X[:, 0] = c - tree = Tree(categorical_features=[0], max_depth=1) + tree = Tree(categorical_features=[0], max_depth=1, random_state=8) # assert perfect tree was reached in one split assert tree.fit(X, y).score(X, y) == 1 + assert tree.feature_importances_[0] == 1 # assert it's not the case without using categorical_features tree = Tree(max_depth=1) From 1d796cfd46a48a181fc1452aad0ff254da1f1be6 Mon Sep 17 00:00:00 2001 From: Arthur Date: Thu, 25 Sep 2025 16:07:32 +0200 Subject: [PATCH 11/23] test categorical; found a new bug in missing --- sklearn/tree/_partitioner.pyx | 4 +- sklearn/tree/_tree.pyx | 4 + sklearn/tree/tests/test_split.py | 150 +++++++++++++++++++++++-------- 3 files changed, 119 insertions(+), 39 deletions(-) diff --git a/sklearn/tree/_partitioner.pyx b/sklearn/tree/_partitioner.pyx index 7feee5b8f082e..602fc202383d4 100644 --- a/sklearn/tree/_partitioner.pyx +++ b/sklearn/tree/_partitioner.pyx @@ -123,6 +123,8 @@ cdef class DensePartitioner: if self.n_categories <= 0: # no a categorical feature sort(&feature_values[self.start], &samples[self.start], self.end - self.start - n_missing) + if n_missing > 0: + return False return feature_values[self.end - n_missing - 1] <= feature_values[self.start] + FEATURE_THRESHOLD else: self._breiman_sort_categories(self.n_categories) @@ -152,9 +154,9 @@ cdef class DensePartitioner: # FIXME: overflow risk + float32 downcast: precision issues? # `sort` only supports float32 for now, that's why means is float32 # maybe we can use some templating to extend sort to float64 - means[c] += self.y[samples[p], 0] if self.sample_weight is not None: w = self.sample_weight[samples[p]] + means[c] += w * self.y[samples[p], 0] self.weighted_counts[c] += w for c in range(nc): diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index ed71e159b1792..4a99bed472f40 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -751,6 +751,10 @@ cdef class Tree: def threshold(self): return self._get_node_ndarray()['threshold'][:self.node_count] + @property + def categorical_bitset(self): + return self._get_node_ndarray()['categorical_bitset'][:self.node_count] + @property def impurity(self): return self._get_node_ndarray()['impurity'][:self.node_count] diff --git a/sklearn/tree/tests/test_split.py b/sklearn/tree/tests/test_split.py index 55287d4d87267..d79a3ac2b443e 100644 --- a/sklearn/tree/tests/test_split.py +++ b/sklearn/tree/tests/test_split.py @@ -1,5 +1,7 @@ from dataclasses import dataclass -from functools import partial +from functools import cached_property, partial +from itertools import chain, combinations +from operator import itemgetter import numpy as np import pytest @@ -17,9 +19,11 @@ REG_CRITERIONS = ("squared_error", "absolute_error", "friedman_mse", "poisson") -def gini_loss(y, y_pred, sample_weight): - p = y_pred[0] - return (p * (1 - p)).sum() +def powerset(iterable): + s = list(iterable) # allows handling sets too + return chain.from_iterable( + (list(c) for c in combinations(s, r)) for r in range(1, (len(s) + 1) // 2 + 1) + ) @dataclass @@ -28,10 +32,7 @@ class NaiveSplitter: criterion: str with_nans: bool n_classes: int - - @staticmethod - def weighted_mean(y, w): - return (y * w).sum() / w.sum() + is_categorical: np.ndarray @staticmethod def weighted_median(y, w): @@ -40,28 +41,33 @@ def weighted_median(y, w): idx = 
np.searchsorted(wc, wc[-1] / 2) return y[sorter[idx]] - def class_ratios(self, y, w): - return np.clip(np.bincount(y, w, minlength=self.n_classes) / w.sum(), None, 1) + @staticmethod + def gini_loss(y, y_pred, sample_weight): + p = y_pred[0] + return (p * (1 - p)).sum() - @property + @cached_property def loss(self): losses = { "poisson": mean_poisson_deviance, "squared_error": mean_squared_error, "absolute_error": mean_absolute_error, - "log_loss": partial(log_loss, labels=list(range(self.n_classes))), - "gini": gini_loss, + "log_loss": partial(log_loss, labels=np.arange(self.n_classes)), + "gini": self.gini_loss, } return losses[self.criterion] - @property + def class_ratios(self, y, w): + return np.clip(np.bincount(y, w, minlength=self.n_classes) / w.sum(), None, 1) + + @cached_property def predictor(self): if self.is_clf: return self.class_ratios elif self.criterion == "absolute_error": return self.weighted_median else: - return self.weighted_mean + return lambda y, w: np.average(y, weights=w) def compute_child_loss(self, y: np.ndarray, w: np.ndarray): if y.size == 0: @@ -71,13 +77,20 @@ def compute_child_loss(self, y: np.ndarray, w: np.ndarray): y_pred[:] = self.predictor(y, w) return w.sum() * self.loss(y, y_pred, sample_weight=w) - def compute_split_loss(self, x, y, w, threshold, missing_left=False): - mask = x < threshold + def compute_split_loss( + self, x, y, w, threshold=None, categories=None, missing_left=False + ): + if categories is not None: + mask_c = np.zeros(int(x.max() + 1), dtype=bool) + mask_c[categories] = True + mask = mask_c[x.astype(int)] + else: + mask = x < threshold if missing_left: mask |= np.isnan(x) if self.criterion == "friedman_mse": - diff = self.weighted_mean(y[mask], w[mask]) - self.weighted_mean( - y[~mask], w[~mask] + diff = np.average(y[mask], weights=w[mask]) - np.average( + y[~mask], weights=w[~mask] ) return (-(diff**2) * w[mask].sum() * w[~mask].sum() / w.sum(),) return ( @@ -85,21 +98,32 @@ def compute_split_loss(self, x, y, w, threshold, missing_left=False): self.compute_child_loss(y[~mask], w[~mask]), ) - def compute_all_losses(self, x, y, w, missing_left=False): + def compute_all_losses(self, x, y, w, is_categorical=False, missing_left=False): + if is_categorical: + return self.compute_all_losses_categorical(x, y, w) nan_mask = np.isnan(x) xu = np.unique(x[~nan_mask], sorted=True) thresholds = (xu[1:] + xu[:-1]) / 2 if nan_mask.any() and not missing_left: thresholds = np.append(thresholds, xu.max() * 2) return thresholds, [ - sum(self.compute_split_loss(x, y, w, threshold, missing_left)) + sum(self.compute_split_loss(x, y, w, threshold, missing_left=missing_left)) for threshold in thresholds ] + def compute_all_losses_categorical(self, x, y, w): + cat_splits = list(powerset(np.unique(x).astype(int))) + return cat_splits, [ + sum(self.compute_split_loss(x, y, w, categories=left_cat)) + for left_cat in cat_splits + ] + def best_split_naive(self, X, y, w): splits = [] for f in range(X.shape[1]): - thresholds, losses = self.compute_all_losses(X[:, f], y, w) + thresholds, losses = self.compute_all_losses( + X[:, f], y, w, is_categorical=self.is_categorical[f] + ) if self.with_nans: thresholds_, losses_ = self.compute_all_losses( X[:, f], y, w, missing_left=True @@ -117,7 +141,7 @@ def best_split_naive(self, X, y, w): f, ) ) - return min(splits) + return min(splits, key=itemgetter(0)) def sparsify(X, density): @@ -128,13 +152,30 @@ def sparsify(X, density): return csc_array(X) +def to_categorical(x, nc): + q = np.linspace(0, 1, num=nc + 1)[1:-1] + 
quantiles = np.quantile(x, q)
+    cats = np.searchsorted(quantiles, x)
+    return np.random.permutation(nc)[cats]
+
+
 def make_simple_dataset(
-    n, d, with_nans, is_sparse, is_clf, n_classes, rng: np.random.Generator
+    n,
+    d,
+    with_nans,
+    is_sparse,
+    is_categorical,
+    is_clf,
+    n_classes,
+    rng: np.random.Generator,
 ):
     X_dense = np.random.rand(n, d)
     y = np.random.rand(n) + X_dense.sum(axis=1)
     w = np.random.rand(n)

+    for idx in np.where(is_categorical)[0]:
+        nc = rng.integers(2, 6)  # can't go too high or the test will be too slow
+        X_dense[:, idx] = to_categorical(X_dense[:, idx], nc)
     with_duplicates = rng.integers(2) == 0
     if with_duplicates:
         X_dense = X_dense.round(1 if n < 50 else 2)
@@ -155,44 +196,77 @@ def make_simple_dataset(
     return X_dense, X, y, w


+def bitset_to_set(v: np.uint64):
+    return [c for c in range(64) if v & (1 << c)]
+
+
 @pytest.mark.parametrize("sparse", ["x", "sparse"])
-@pytest.mark.parametrize("missing_values", ["x", "missing_values"])
+@pytest.mark.parametrize("categorical", ["x", "categorical"])
+@pytest.mark.parametrize("missing_values", ["missing_values"])
 @pytest.mark.parametrize(
     "criterion",
-    ["gini", "log_loss", "squared_error", "absolute_error", "friedman_mse", "poisson"],
+    ["gini", "log_loss", "squared_error", "friedman_mse", "poisson"],
 )
-def test_best_split_optimality(sparse, missing_values, criterion, global_random_seed):
+def test_best_split_optimality(
+    sparse, categorical, missing_values, criterion, global_random_seed
+):
     is_clf = criterion in CLF_CRITERIONS
     with_nans = missing_values != "x"
     is_sparse = sparse != "x"
+    with_categoricals = categorical != "x"
     if is_sparse and with_nans:
         pytest.skip("Sparse + missing values not supported yet")
+    if with_categoricals and (is_sparse or criterion == "absolute_error" or with_nans):
+        pytest.skip("Categorical features not supported in this case")

-    rng = np.random.default_rng(global_random_seed)
+    rng = np.random.default_rng()

-    d = 2
-    for it, n in enumerate([5] * 10 + [10] * 10 + [30] * 5 + [100, 100, 200]):
-        n_classes = rng.integers(2, 5)
+    ns = [5] * 5 + [10] * 5 + [30, 30]
+    if not with_categoricals and criterion != "log_loss":
+        ns.extend([30, 30, 30, 100, 100, 200])
+
+    for it, n in enumerate(ns):
+        d = rng.integers(1, 4)
+        n_classes = 2 if with_categoricals else rng.integers(2, 5)
+        if with_categoricals:
+            is_categorical = rng.random(d) < 0.5
+        else:
+            is_categorical = np.zeros(d, dtype=bool)
         X_dense, X, y, w = make_simple_dataset(
-            n, d, with_nans, is_sparse, is_clf, n_classes, rng
+            n, d, with_nans, is_sparse, is_categorical, is_clf, n_classes, rng
         )
-        naive_splitter = NaiveSplitter(is_clf, criterion, with_nans, n_classes)
+        naive_splitter = NaiveSplitter(
+            is_clf, criterion, with_nans, n_classes, is_categorical
+        )
+        best_split = naive_splitter.best_split_naive(X_dense, y, w)

+        is_categorical = is_categorical if is_categorical.any() else None
         Tree = DecisionTreeClassifier if is_clf else DecisionTreeRegressor
-        tree = Tree(criterion=criterion, max_depth=1, max_features=d)
+        tree = Tree(
+            criterion=criterion,
+            max_depth=1,
+            max_features=d,
+            categorical_features=is_categorical,
+        )
         tree.fit(X, y, sample_weight=w)
-        best_split = naive_splitter.best_split_naive(X_dense, y, w)
+
+        split_feature = tree.tree_.feature[0]
+        split = (
+            {"threshold": tree.tree_.threshold[0]}
+            if is_categorical is None or not is_categorical[split_feature]
+            else {"categories": bitset_to_set(tree.tree_.categorical_bitset[0])}
+        )
         tree_loss = naive_splitter.compute_split_loss(
             X_dense[:, tree.tree_.feature[0]],
             y,
             w,
-
tree.tree_.threshold[0],
-            bool(tree.tree_.missing_go_to_left[0]),
+            **split,
+            missing_left=bool(tree.tree_.missing_go_to_left[0]),
         )
         tree_split = (
             sum(tree_loss),
-            tree.tree_.threshold[0],
+            split,
             bool(tree.tree_.missing_go_to_left[0]),
             tree.tree_.feature[0],
         )

From 5c3de79000111c01723dca885398f80ddf905207 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Thu, 25 Sep 2025 23:02:48 +0200
Subject: [PATCH 12/23] docstring for Breiman sort

---
 sklearn/tree/_partitioner.pyx | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/sklearn/tree/_partitioner.pyx b/sklearn/tree/_partitioner.pyx
index 602fc202383d4..ab6cd3c7a37ed 100644
--- a/sklearn/tree/_partitioner.pyx
+++ b/sklearn/tree/_partitioner.pyx
@@ -131,7 +131,21 @@ cdef class DensePartitioner:
         return feature_values[self.start] == feature_values[self.end - 1]

     cdef void _breiman_sort_categories(self, intp_t nc) noexcept nogil:
-        """ O(n + nc log nc)"""
+        """
+        Order self.sorted_cat by ascending average target value
+        and order self.feature_values & self.samples such that
+        - self.feature_values is ordered according to the order of sorted_cat
+        - the relation `self.feature_values[p] = self.X[f, self.samples[p]]` is
+          preserved
+
+        E.g.
 sorted_cat is [2 0 1]
+        feature_values is [2 2 2 0 0 1 1 1 1]
+
+        This ordering ensures the optimal split will be among the candidate splits
+        evaluated by the splitter (this is called the Breiman shortcut).
+
+        Time complexity: O(n + nc log nc)
+        """
         cdef:
             intp_t* counts = &self.counts[0]
             float64_t* weighted_counts = &self.weighted_counts[0]
@@ -151,9 +165,6 @@ cdef class DensePartitioner:
         for p in range(self.start, self.end):
             c = feature_values[p]
             counts[c] += 1
-            # FIXME: overflow risk + float32 downcast: precision issues?
-            # `sort` only supports float32 for now, that's why means is float32
-            # maybe we can use some templating to extend sort to float64
             if self.sample_weight is not None:
                 w = self.sample_weight[samples[p]]
             means[c] += w * self.y[samples[p], 0]

From f47ca8e6db782a56baafb2dc2671866da31917e1 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Fri, 26 Sep 2025 00:20:29 +0200
Subject: [PATCH 13/23] minor comments fixes

---
 sklearn/tree/_partitioner.pyx    | 13 ++++++-------
 sklearn/tree/tests/test_split.py |  4 ++--
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/sklearn/tree/_partitioner.pyx b/sklearn/tree/_partitioner.pyx
index ab6cd3c7a37ed..d0f51e9341d59 100644
--- a/sklearn/tree/_partitioner.pyx
+++ b/sklearn/tree/_partitioner.pyx
@@ -135,7 +135,7 @@ cdef class DensePartitioner:
         Order self.sorted_cat by ascending average target value
         and order self.feature_values & self.samples such that
         - self.feature_values is ordered according to the order of sorted_cat
-        - the relation `self.feature_values[p] = self.X[f, self.samples[p]]` is
+        - the relation `self.feature_values[p] = self.X[self.samples[p], f]` is
           preserved

         E.g.
@@ -171,16 +171,15 @@ cdef class DensePartitioner:
         for c in range(nc):
-            # TODO: weighted counts
             if weighted_counts[c] > 0:
                 means[c] /= weighted_counts[c]

-        # sorted_cat[i] = i-th categories sorted my ascending means
+        # sorted_cat[i] = i-th category sorted by ascending means
         for c in range(nc):
             sorted_cat[c] = c
         sort(means, sorted_cat, nc)

-        # ) build offsets such that:
+        # build offsets such that:
         # offsets[c] = sum( counts[x] for all x s.t. rank(x) <= rank(c) ) - 1
         cdef intp_t offset = 0
         for r in range(nc):
@@ -188,8 +187,8 @@ cdef class DensePartitioner:
             offset += counts[c]
             offsets[c] = offset - 1

-        # sort feature_values & samples such that they are ordered by
-        # the mean of the category
+        # sort feature_values & samples in-place such that
+        # they are ordered by the mean of the category
         # while ensuring samples of the same categories are contiguous
         p = self.start
         while p < self.end:
@@ -197,7 +196,7 @@ cdef class DensePartitioner:
             new_p = offsets[c]
             if new_p > p:
                 swap(feature_values, samples, p, new_p)
-                # ^ preserves invariant: feature[p] = X[samples[p], f]
+                # swap preserves invariant: feature[p] = X[samples[p], f]
                 offsets[c] -= 1
             else:
                 p += 1
diff --git a/sklearn/tree/tests/test_split.py b/sklearn/tree/tests/test_split.py
index d79a3ac2b443e..5bbedb7207903 100644
--- a/sklearn/tree/tests/test_split.py
+++ b/sklearn/tree/tests/test_split.py
@@ -200,9 +200,9 @@ def bitset_to_set(v: np.uint64):
     return [c for c in range(64) if v & (1 << c)]


-@pytest.mark.parametrize("sparse", ["x", "sparse"])
+@pytest.mark.parametrize("sparse", ["x"])
 @pytest.mark.parametrize("categorical", ["x", "categorical"])
-@pytest.mark.parametrize("missing_values", ["missing_values"])
+@pytest.mark.parametrize("missing_values", ["x"])
 @pytest.mark.parametrize(
     "criterion",
     ["gini", "log_loss", "squared_error", "friedman_mse", "poisson"],

From 96998eef5b0e961a68c32434ac5dd52f038d8179 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Fri, 26 Sep 2025 10:05:52 +0200
Subject: [PATCH 14/23] addressed some copilot comments

---
 sklearn/tree/_classes.py      | 10 ++++++++--
 sklearn/tree/_partitioner.pyx |  2 +-
 sklearn/tree/_tree.pyx        |  2 +-
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py
index 6ef5ac5b7985a..083ee36346d32 100644
--- a/sklearn/tree/_classes.py
+++ b/sklearn/tree/_classes.py
@@ -127,7 +127,6 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta):
         "monotonic_cst": ["array-like", None],
         "categorical_features": [
             "array-like",
-            StrOptions({"from_dtype"}),
             None,
         ],
     }
@@ -563,6 +562,10 @@ def _check_categorical_features(self, X, monotonic_cst):
             f"Values for categorical features should be integers in [0, {MAX_NC - 1}]."
         )
         for idx in np.where(is_categorical)[0]:
+            if np.isnan(X[:, idx]).any():
+                raise ValueError(
+                    "Missing values are not supported in categorical features"
+                )
             if not np.allclose(X[:, idx].astype(np.intp), X[:, idx]):
                 raise ValueError(f"{base_msg} Found non-integer values.")
             if X[:, idx].min() < 0:
@@ -572,7 +575,10 @@ def _check_categorical_features(self, X, monotonic_cst):
                 raise ValueError(f"{base_msg} Found {X_idx_max}.")
             n_categories_in_feature[idx] = X_idx_max + 1
             if monotonic_cst is not None and monotonic_cst[idx] != 0:
-                raise ValueError("Categorical features are incompatible with ")
+                raise ValueError(
+                    "A categorical feature cannot have a non-null monotonic"
+                    " constraint.
" + ) return is_categorical, n_categories_in_feature diff --git a/sklearn/tree/_partitioner.pyx b/sklearn/tree/_partitioner.pyx index d0f51e9341d59..9ef254dea41e8 100644 --- a/sklearn/tree/_partitioner.pyx +++ b/sklearn/tree/_partitioner.pyx @@ -121,7 +121,7 @@ cdef class DensePartitioner: if n_missing == self.end - self.start: return True if self.n_categories <= 0: - # no a categorical feature + # not a categorical feature sort(&feature_values[self.start], &samples[self.start], self.end - self.start - n_missing) if n_missing > 0: return False diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx index 4a99bed472f40..10ede1924614e 100644 --- a/sklearn/tree/_tree.pyx +++ b/sklearn/tree/_tree.pyx @@ -960,7 +960,7 @@ cdef class Tree: node.categorical_bitset = cat_split else: node.threshold = threshold - node.categorical_bitset = cat_split + node.categorical_bitset = 0 node.missing_go_to_left = missing_go_to_left self.node_count += 1 From 09b03a0a58aa885618849b9ebc02bb93450861f3 Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 7 Oct 2025 14:45:25 +0200 Subject: [PATCH 15/23] enable more tests --- sklearn/tree/tests/test_split.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/tests/test_split.py b/sklearn/tree/tests/test_split.py index 5bbedb7207903..af1734bffc249 100644 --- a/sklearn/tree/tests/test_split.py +++ b/sklearn/tree/tests/test_split.py @@ -200,9 +200,9 @@ def bitset_to_set(v: np.uint64): return [c for c in range(64) if v & (1 << c)] -@pytest.mark.parametrize("sparse", ["x"]) +@pytest.mark.parametrize("sparse", ["x", "sparse"]) @pytest.mark.parametrize("categorical", ["x", "categorical"]) -@pytest.mark.parametrize("missing_values", ["x"]) +@pytest.mark.parametrize("missing_values", ["x", "missing_values"]) @pytest.mark.parametrize( "criterion", ["gini", "log_loss", "squared_error", "friedman_mse", "poisson"], From 1ded41a956aaed269c4f7b0a1e8734e89a539d84 Mon Sep 17 00:00:00 2001 From: Arthur Date: Sun, 18 Jan 2026 20:17:07 +0100 Subject: [PATCH 16/23] fix uint64 bug --- sklearn/tree/_partitioner.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sklearn/tree/_partitioner.pyx b/sklearn/tree/_partitioner.pyx index ad8d9a7137115..0f555b0662b49 100644 --- a/sklearn/tree/_partitioner.pyx +++ b/sklearn/tree/_partitioner.pyx @@ -392,7 +392,7 @@ cdef class DensePartitioner: cdef intp_t offset = 0 for r in range(nc): c = self.sorted_cat[r] - bitset |= 1 << c + bitset |= (1) << c offset += self.counts[c] if offset >= p: break @@ -680,7 +680,7 @@ cdef inline bint goes_left( if isnan(value): return missing_go_to_left elif is_categorical: - return split_value.cat_split & (1 << ( value)) + return split_value.cat_split & ((1) << ( value)) else: return value <= split_value.threshold From 7eac9b6d5611858cd360b14a91ff3ed58880a4c0 Mon Sep 17 00:00:00 2001 From: Arthur Date: Sun, 18 Jan 2026 20:18:06 +0100 Subject: [PATCH 17/23] rm --- sklearn/tree/README.md | 76 ------------------------------------------ 1 file changed, 76 deletions(-) delete mode 100644 sklearn/tree/README.md diff --git a/sklearn/tree/README.md b/sklearn/tree/README.md deleted file mode 100644 index 870f0bc191b61..0000000000000 --- a/sklearn/tree/README.md +++ /dev/null @@ -1,76 +0,0 @@ -# sklearn.tree internals - -Quick map of how `sklearn.tree` is wired and how other modules use it. - -## File map -- `_classes.py`: public estimators + `BaseDecisionTree` (validation, builder setup). 
-- `_tree.pyx` / `_tree.pxd`: core `Tree` data structure, builders, prediction, - decision path, pruning helpers; defines `DTYPE` (float32) and `DOUBLE` (float64). -- `_splitter.pyx` / `_splitter.pxd`: split search logic (best/random) using - criteria and partitioners. -- `_partitioner.pyx` / `_partitioner.pxd`: dense/sparse sample partitioning for - a given feature/threshold; handles missing values layout. -- `_criterion.pyx` / `_criterion.pxd`: impurity criteria (Gini/Entropy/MSE/MAE/Poisson), - plus helper structures for efficient updates. -- `_utils.pyx` / `_utils.pxd`: small C helpers, RNG helpers, and `WeightedFenwickTree` - used by criteria. -- `_export.py`: `export_graphviz`, `export_text`, `plot_tree`. -- `_reingold_tilford.py`: layout algorithm used by `plot_tree`. - -## Core objects and data flow -### Fit path -1. `BaseDecisionTree._fit` in `_classes.py` validates input, handles class/target - encoding, computes missing-value mask, and resolves hyperparameters. -2. A `Criterion` instance is created (classification or regression). -3. A `Splitter` instance is created (dense vs sparse, best vs random), optionally - with monotonic constraints. -4. A `Tree` object is allocated to hold node arrays and node values. -5. A `TreeBuilder` builds the tree: - - `DepthFirstTreeBuilder` grows nodes in DFS order (default). - - `BestFirstTreeBuilder` grows by impurity improvement when `max_leaf_nodes` is set. -6. Optional cost-complexity pruning replaces `tree_` with a pruned tree. - -### Predict path -- `Tree.predict` walks each sample to a leaf and returns per-node values. -- Estimator methods (`predict`, `predict_proba`, `apply`, `decision_path`) call `tree_`. - -## Data structures -- `Tree` stores a binary tree as parallel arrays of nodes (see `_tree.pxd`). - Key arrays include `children_left/right`, `feature`, `threshold`, `impurity`, - `n_node_samples`, `weighted_n_node_samples`, `missing_go_to_left`, and `value`. -- Node values are stored as a `(node_count, n_outputs, max_n_classes)` array so - the same structure supports regression and classification. - -## Split search and partitioning -- `Splitter` owns the feature subset, sample indices, and the current `Criterion`. -- `_partitioner` provides dense and sparse implementations for sorting and - partitioning samples for a candidate feature, including missing-value handling. -- `_criterion` computes impurity, impurity improvement, and node values using - incremental updates as the split position moves. - -## Missing values and monotonic constraints -- Missing values are only supported on dense inputs and when tags allow it; - `_compute_missing_values_in_feature_mask` computes a per-feature mask. -- Missing values are grouped to the left or right during split evaluation and - recorded in `missing_go_to_left` so prediction follows the same rule. -- Monotonic constraints are validated in `_classes.py` and passed to splitters. - -## Pruning -- `_build_pruned_tree_ccp` (in `_tree.pyx`) builds a new `Tree` from a subtree - based on `ccp_alpha`. `cost_complexity_pruning_path` in `_classes.py` exposes - the pruning path utility. - -## How other modules use `sklearn.tree` -- `sklearn.ensemble` uses `DecisionTree*` and `ExtraTree*` as base estimators: - - `_forest.py` calls `tree._fit(...)` directly to reuse input validation and - pass bootstrap weights; it also calls `tree.apply` and `tree.decision_path`. - - Feature importances are aggregated via `tree.tree_.compute_feature_importances()`. 
-- Gradient boosting and related ensemble models also build trees via
-  `DecisionTreeRegressor` and rely on the same `Tree`/`Splitter`/`Criterion`
-  machinery.
-
-## Notes for edits
-- Most heavy lifting is in Cython; change signatures in `.pxd` and `.pyx` together.
-- Input arrays are coerced to `DTYPE` (float32) for `X` and `DOUBLE` (float64) for `y`.
-- Missing-value support is conservative; ensure changes keep the mask and
-  `missing_go_to_left` consistent.

From ff856944aa5f29955a2653463d2dd3c41e0874fa Mon Sep 17 00:00:00 2001
From: Arthur
Date: Mon, 26 Jan 2026 21:54:05 +0100
Subject: [PATCH 18/23] fixed test

---
 sklearn/tree/tests/test_split.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/sklearn/tree/tests/test_split.py b/sklearn/tree/tests/test_split.py
index b85a6a1e6ebf8..1b4afa4003fb7 100644
--- a/sklearn/tree/tests/test_split.py
+++ b/sklearn/tree/tests/test_split.py
@@ -35,7 +35,7 @@

 def powerset(iterable):
     """returns all the subsets of `iterable`."""
-    s = list(iterable)  # allows handling sets too
+    s = list(iterable)
     return chain.from_iterable(
         combinations(s, r) for r in range(1, (len(s) + 1) // 2 + 1)
     )
@@ -132,7 +132,7 @@ def _generate_all_splits(self, X):
             nan_mask = np.isnan(x)
             thresholds = np.unique(x[~nan_mask])
             if self.is_categorical[f]:
-                thresholds = list(powerset(thresholds.astype(int)))
+                thresholds = list(powerset(int(th) for th in thresholds))
             for th in thresholds:
                 yield Split(f, th)
             if not nan_mask.any():
@@ -153,11 +153,11 @@ def best_split_naive(self, X, y, w):
         return min(zip(split_impurities, splits), key=itemgetter(0))


-def to_categorical(x, nc):
+def to_categorical(x, nc, rng):
     q = np.linspace(0, 1, num=nc + 1)[1:-1]
     quantiles = np.quantile(x, q)
     cats = np.searchsorted(quantiles, x)
-    return np.random.permutation(nc)[cats]
+    return rng.permutation(nc)[cats]


 def make_simple_dataset(
@@ -176,7 +176,7 @@ def make_simple_dataset(

     for idx in np.where(is_categorical)[0]:
         nc = rng.integers(2, 6)  # can't go too high or the test will be too slow
-        X_dense[:, idx] = to_categorical(X_dense[:, idx], nc)
+        X_dense[:, idx] = to_categorical(X_dense[:, idx], nc, rng)
     with_duplicates = rng.integers(2) == 0
     if with_duplicates:
         X_dense = X_dense.round(1 if n < 50 else 2)
@@ -221,7 +221,7 @@ def test_split_impurity(
     if categorical and "Extra" in Tree.__name__:
         pytest.skip("Categorical features not implemented for the random splitter")

-    rng = np.random.default_rng()
+    rng = np.random.default_rng(global_random_seed)

     ns = [5] * 5 + [10] * 5 + [20, 30, 50, 100]

@@ -234,6 +234,7 @@ def test_split_impurity(
     )
     if categorical:
         is_categorical = rng.random(d) < 0.5
+        n_classes = 2
         tree_kwargs["categorical_features"] = is_categorical
     else:
         is_categorical = np.zeros(d, dtype=bool)

From a5f7376eafd592b3f28791bd1e640a5eb54065c1 Mon Sep 17 00:00:00 2001
From: Arthur
Date: Mon, 26 Jan 2026 22:04:52 +0100
Subject: [PATCH 19/23] fix int size bug

---
 sklearn/tree/_tree.pyx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklearn/tree/_tree.pyx b/sklearn/tree/_tree.pyx
index 0537961fca776..4f220bfd9da2c 100644
--- a/sklearn/tree/_tree.pyx
+++ b/sklearn/tree/_tree.pyx
@@ -1018,7 +1018,7 @@ cdef class Tree:
                         else:
                             node = &self.nodes[node.right_child]
                     elif self.is_categorical[node.feature]:
-                        if node.categorical_bitset & (1 << (<intp_t> X_i_node_feature)):
+                        if node.categorical_bitset & ((<uint64_t>1) << (<intp_t> X_i_node_feature)):
                             node = &self.nodes[node.left_child]
                         else:
                             node = &self.nodes[node.right_child]
@@ -1150,7 +1150,7 @@ cdef class Tree:
                         else:
                             node =
&self.nodes[node.right_child] elif self.is_categorical[node.feature]: - if node.categorical_bitset & (1 << ( X_ndarray[i, node.feature])): + if node.categorical_bitset & ((1) << ( X_ndarray[i, node.feature])): node = &self.nodes[node.left_child] else: node = &self.nodes[node.right_child] From 67200b3ae6e00a1c4d77d8a0488d2bf8de0ea559 Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 27 Jan 2026 09:49:20 +0100 Subject: [PATCH 20/23] guard for incompatibilities of categorical features --- sklearn/tree/_classes.py | 16 ++++++++++++++++ sklearn/tree/tests/test_tree.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index 32e35a1a7a75f..bb6100631fe7e 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -445,6 +445,22 @@ def _fit( self._check_categorical_features(X, monotonic_cst) ) + has_categorical = bool(np.any(self.is_categorical_)) + if has_categorical and self.splitter == "random": + raise ValueError( + "Categorical features are not supported with splitter='random'. " + "Use splitter='best' instead." + ) + if has_categorical and self.n_outputs_ > 1: + raise ValueError( + "Categorical features are not supported with multi-output targets." + ) + if has_categorical and is_classifier(self) and np.any(self.n_classes_ > 2): + raise ValueError( + "Categorical features are only supported for binary classification. " + f"Found {self.n_classes_.max()} classes." + ) + if not isinstance(self.splitter, Splitter): splitter = SPLITTERS[self.splitter]( criterion, diff --git a/sklearn/tree/tests/test_tree.py b/sklearn/tree/tests/test_tree.py index 174cf6ea2bd11..662736d0cfae9 100644 --- a/sklearn/tree/tests/test_tree.py +++ b/sklearn/tree/tests/test_tree.py @@ -3134,3 +3134,32 @@ def test_random_splitter_missing_values_uses_non_missing_min_max(X, y): assert np.isfinite(threshold) assert non_missing.min() <= threshold <= non_missing.max() + + +def test_categorical_random_splitter_raises(): + X = np.array([[0], [1], [2], [3]], dtype=np.float32) + y = np.array([0, 1, 0, 1]) + + tree = DecisionTreeClassifier( + splitter="random", categorical_features=[0], random_state=0 + ) + with pytest.raises(ValueError, match="splitter='random'"): + tree.fit(X, y) + + +def test_categorical_multiclass_classification_raises(): + X = np.array([[0], [1], [2], [3]], dtype=np.float32) + y = np.array([0, 1, 2, 0]) + + tree = DecisionTreeClassifier(categorical_features=[0], random_state=0) + with pytest.raises(ValueError, match="binary classification"): + tree.fit(X, y) + + +def test_categorical_multioutput_raises(): + X = np.array([[0], [1], [2], [3]], dtype=np.float32) + y = np.array([[0.0, 1.0], [1.0, 0.0], [0.0, 1.0], [1.0, 0.0]]) + + tree = DecisionTreeRegressor(categorical_features=[0], random_state=0) + with pytest.raises(ValueError, match="multi-output"): + tree.fit(X, y) From 09188aab550651959dcd3335e422a9706e2e8071 Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 27 Jan 2026 09:56:59 +0100 Subject: [PATCH 21/23] Update BasePartitioner commented class --- sklearn/tree/_partitioner.pxd | 16 ++++++++++------ sklearn/tree/_partitioner.pyx | 2 -- sklearn/tree/_utils.pxd | 3 +-- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/sklearn/tree/_partitioner.pxd b/sklearn/tree/_partitioner.pxd index 572eeb6ea04af..94314af5285cc 100644 --- a/sklearn/tree/_partitioner.pxd +++ b/sklearn/tree/_partitioner.pxd @@ -32,10 +32,12 @@ cdef const float32_t FEATURE_THRESHOLD = 1e-7 # cdef intp_t end # cdef intp_t n_missing # cdef 
const uint8_t[::1] missing_values_in_feature_mask +# cdef intp_t n_categories -# cdef void sort_samples_and_feature_values( +# cdef bint sort_samples_and_feature_values( # self, intp_t current_feature # ) noexcept nogil +# cdef void shift_missing_to_the_left(self) noexcept nogil # cdef void init_node_split( # self, # intp_t start, @@ -52,15 +54,19 @@ cdef const float32_t FEATURE_THRESHOLD = 1e-7 # intp_t* p_prev, # intp_t* p # ) noexcept nogil +# cdef inline SplitValue pos_to_threshold( +# self, intp_t p_prev, intp_t p +# ) noexcept nogil # cdef intp_t partition_samples( # self, -# float64_t current_threshold +# float64_t current_threshold, +# bint missing_go_to_left # ) noexcept nogil # cdef void partition_samples_final( # self, -# float64_t best_threshold, +# SplitValue split_value, # intp_t best_feature, -# bint best_missing_go_to_left +# bint best_missing_go_to_left, # ) noexcept nogil @@ -119,7 +125,6 @@ cdef class DensePartitioner: ) noexcept nogil cdef void partition_samples_final( self, - intp_t best_pos, SplitValue split_value, intp_t best_feature, bint best_missing_go_to_left, @@ -182,7 +187,6 @@ cdef class SparsePartitioner: ) noexcept nogil cdef void partition_samples_final( self, - intp_t best_pos, SplitValue split_value, intp_t best_feature, bint best_missing_go_to_left, diff --git a/sklearn/tree/_partitioner.pyx b/sklearn/tree/_partitioner.pyx index 0f555b0662b49..0daf9e08c1f1d 100644 --- a/sklearn/tree/_partitioner.pyx +++ b/sklearn/tree/_partitioner.pyx @@ -359,7 +359,6 @@ cdef class DensePartitioner: cdef inline void partition_samples_final( self, - intp_t best_pos, SplitValue split_value, intp_t best_feature, bint best_missing_go_to_left @@ -577,7 +576,6 @@ cdef class SparsePartitioner: cdef inline void partition_samples_final( self, - intp_t best_pos, SplitValue split_value, intp_t best_feature, bint missing_go_to_left diff --git a/sklearn/tree/_utils.pxd b/sklearn/tree/_utils.pxd index 32f42f0d59b84..4857edc070202 100644 --- a/sklearn/tree/_utils.pxd +++ b/sklearn/tree/_utils.pxd @@ -15,8 +15,7 @@ ctypedef union SplitValue: # features. The floating point view, i.e. ``split_value.threshold`` is used # for numerical features, where feature values less than or equal to the # threshold go left, and values greater than the threshold go right. 
- # - # For categorical features, TODO + float64_t threshold uint64_t cat_split # bitset From 88a2d31ae67088190ae6f2049804be024c2c2759 Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 27 Jan 2026 15:09:50 +0100 Subject: [PATCH 22/23] fix --- sklearn/tree/_splitter.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx index 967e561cbd59c..27b9efaf0bfd8 100644 --- a/sklearn/tree/_splitter.pyx +++ b/sklearn/tree/_splitter.pyx @@ -453,7 +453,6 @@ cdef inline int node_split_best( # Reorganize into samples[start:best_split.pos] + samples[best_split.pos:end] if best_split.pos < end: partitioner.partition_samples_final( - best_split.pos, best_split.value, best_split.feature, best_split.missing_go_to_left @@ -689,7 +688,6 @@ cdef inline int node_split_random( if best_split.pos < end: if current_split.feature != best_split.feature: partitioner.partition_samples_final( - best_split.pos, best_split.value, best_split.feature, best_split.missing_go_to_left From e43336c4a095a13829f1a2e33c18f0fd572308f9 Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 27 Jan 2026 22:57:59 +0100 Subject: [PATCH 23/23] docstrings (public and internal) --- sklearn/tree/_classes.py | 37 +++++++++++++++++++++++++++++++---- sklearn/tree/_partitioner.pyx | 26 +++++++++++++++++++----- sklearn/tree/_splitter.pyx | 5 +++-- 3 files changed, 57 insertions(+), 11 deletions(-) diff --git a/sklearn/tree/_classes.py b/sklearn/tree/_classes.py index bb6100631fe7e..0b02e01e3896e 100644 --- a/sklearn/tree/_classes.py +++ b/sklearn/tree/_classes.py @@ -535,10 +535,11 @@ def _check_categorical_features(self, X, monotonic_cst): Return ------ - is_categorical : ndarray of shape (n_features,) or None, dtype=bool - Indicates whether a feature is categorical. If no feature is - categorical, this is None. - n_categories_in_feature: TODO + is_categorical : ndarray of shape (n_features,), dtype=bool + Boolean mask indicating whether each feature is categorical. + n_categories_in_feature : ndarray of shape (n_features,), dtype=intp + For categorical features, stores ``max(X[:, idx]) + 1``. For + non-categorical features, stores ``-1``. """ n_features = X.shape[1] categorical_features = np.asarray(self.categorical_features) @@ -977,6 +978,19 @@ class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): .. versionadded:: 1.4 + categorical_features : array-like of int or bool of shape (n_features,) or + (n_categorical_features,), default=None + Indicates which features are treated as categorical. + + - If array-like of int, the entries are feature indices. + - If array-like of bool, it is a boolean mask over features. + + Categorical features are only supported for dense inputs + and single-output targets. + Values of categorical features must be contiguous integers in ``[0, 63]`` + (missing values are not supported). + Categorical features cannot have non-zero monotonic constraints. + Attributes ---------- classes_ : ndarray of shape (n_classes,) or list of ndarray @@ -1371,6 +1385,21 @@ class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): .. versionadded:: 1.4 + categorical_features : array-like of int or bool of shape (n_features,) or + (n_categorical_features,), default=None + Indicates which features are treated as categorical. + + - If array-like of int, the entries are feature indices. + - If array-like of bool, it is a boolean mask over features. + + Categorical features are only supported for dense inputs + and single-output targets. 
+        Values of categorical features must be contiguous integers in ``[0, 63]``
+        (missing values are not supported).
+        Categorical features cannot have non-zero monotonic constraints.
+
+        When these constraints are not met, ``fit`` will raise an error.
+
     Attributes
     ----------
     feature_importances_ : ndarray of shape (n_features,)
diff --git a/sklearn/tree/_partitioner.pyx b/sklearn/tree/_partitioner.pyx
index 0daf9e08c1f1d..803631e143c08 100644
--- a/sklearn/tree/_partitioner.pyx
+++ b/sklearn/tree/_partitioner.pyx
@@ -80,9 +80,10 @@ cdef class DensePartitioner:
     ) noexcept nogil:
         """Simultaneously sort based on the feature_values.

-        Missing values are stored at the end of feature_values.
-        The number of missing values observed in feature_values is stored
-        in self.n_missing.
+        For numerical features, this is a standard sort. For categorical
+        features, samples are reordered using the Breiman ordering shortcut.
+
+        Returns ``True`` when the feature is constant at the current node.
         """
         cdef:
             intp_t i, current_end
@@ -302,6 +303,12 @@ cdef class DensePartitioner:
     cdef inline SplitValue pos_to_threshold(
         self, intp_t p_prev, intp_t p
    ) noexcept nogil:
+        """Convert a split position into a concrete split value.
+
+        For numerical features, this returns the usual mid-point threshold.
+        For categorical features, it converts the split position into a bitset
+        over categories.
+        """
         cdef SplitValue split
         cdef intp_t end_non_missing = (
             self.end if self.missing_on_the_left
@@ -386,6 +393,7 @@ cdef class DensePartitioner:
                     samples[partition_end], samples[partition_start])

     cdef inline uint64_t _split_pos_to_bitset(self, intp_t p, intp_t nc) noexcept nogil:
+        """Convert a split position ``p`` into a categorical bitset."""
         cdef uint64_t bitset = 0
         cdef intp_t r, c
         cdef intp_t offset = 0
@@ -448,7 +456,10 @@ cdef class SparsePartitioner:
         self, intp_t current_feature
     ) noexcept nogil:
-        """Simultaneously sort based on the feature_values."""
+        """Simultaneously sort based on the feature_values.
+
+        Returns ``True`` when the feature is constant at the current node.
+        """
         cdef:
             float32_t[::1] feature_values = self.feature_values
             intp_t[::1] index_to_samples = self.index_to_samples
@@ -554,7 +565,7 @@ cdef class SparsePartitioner:
     cdef inline SplitValue pos_to_threshold(
         self, intp_t p_prev, intp_t p
     ) noexcept nogil:
-
+        """Convert a split position into a numerical threshold."""
         cdef SplitValue split
         # split between two non-missing values
         # sum of halves is used to avoid infinite value
@@ -675,6 +686,11 @@ cdef class SparsePartitioner:
 cdef inline bint goes_left(
     SplitValue split_value, bint missing_go_to_left, bint is_categorical, float32_t value
 ) noexcept nogil:
+    """Return whether ``value`` should go to the left child.
+
+    This helper centralizes the split semantics for numerical, categorical,
+    and missing values.
+    """
     if isnan(value):
         return missing_go_to_left
     elif is_categorical:
diff --git a/sklearn/tree/_splitter.pyx b/sklearn/tree/_splitter.pyx
index 27b9efaf0bfd8..c08ab11992654 100644
--- a/sklearn/tree/_splitter.pyx
+++ b/sklearn/tree/_splitter.pyx
@@ -152,8 +152,9 @@ cdef class Splitter:
             are assumed to have uniform weight. This is represented
             as a Cython memoryview.

-        has_missing : bool
-            At least one missing values is in X.
+        n_categories_in_feature : ndarray, dtype=intp_t
+            Per-feature number of categories for categorical features, and
+            ``-1`` for numerical features.
         """

        self.rand_r_state = self.random_state.randint(0, RAND_R_MAX)
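
Taken together, the series leaves a usable public surface on the branch. A
minimal usage sketch, assuming a build of this branch (released scikit-learn
versions do not expose `categorical_features` on the tree estimators); it
mirrors the `test_categorical` test added earlier in the series, where a single
categorical split separates even from odd category codes:

    import numpy as np
    from sklearn.tree import DecisionTreeClassifier

    rng = np.random.default_rng(0)
    X = rng.random((200, 3))
    X[:, 0] = rng.integers(0, 8, size=200)  # integer category codes, must lie in [0, 63]
    y = X[:, 0].astype(int) % 2             # binary target, as the guards require

    # categorical_features accepts feature indices or a boolean mask over features.
    clf = DecisionTreeClassifier(categorical_features=[0], max_depth=1, random_state=0)
    clf.fit(X, y)
    print(clf.score(X, y))  # should be 1.0: one bitset split isolates the even codes

Note that the same data fitted without `categorical_features` cannot reach a
perfect score at depth 1, since no single threshold separates the two parity
classes; that contrast is exactly what `test_categorical` asserts.
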