1 change: 1 addition & 0 deletions c/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -104,6 +104,7 @@ add_library(
src/preprocessing/quantize/pq.cpp
src/preprocessing/quantize/scalar.cpp
src/distance/pairwise_distance.cpp
src/selection/select_k.cpp
)
add_library(cuvs::c_api ALIAS cuvs_c)
set_target_properties(
29 changes: 29 additions & 0 deletions c/include/cuvs/core/c_api.h
@@ -129,6 +129,22 @@ cuvsError_t cuvsStreamSync(cuvsResources_t res);
*/
cuvsError_t cuvsDeviceIdGet(cuvsResources_t res, int* device_id);

/**
* @brief Configure the temporary workspace on this resources object as an uncapped pool, backed
* by the current device memory resource. After the initial reservation is allocated on
* first use, subsequent calls to cuvsRMMAlloc / cuvsRMMFree on the same resources handle
* hit the pool cache rather than calling cudaMallocAsync / cudaFreeAsync, reducing CUDA
* context lock contention under concurrent query threads. The pool grows without shrinking:
* freed allocations are returned to the pool rather than to the device, so the pool's
* high-water mark only increases until the resources object is destroyed.
*
* @param[in] res cuvsResources_t opaque C handle
* @param[in] initial_size_bytes initial pool reservation in bytes; size to cover the
* steady-state working set to avoid growth after warmup
* @return cuvsError_t
*/
cuvsError_t cuvsResourcesSetWorkspacePool(cuvsResources_t res, size_t initial_size_bytes);

/**
* @brief Create an initialized opaque C handle for C++ type `raft::device_resources_snmg`
* for multi-GPU operations
@@ -210,6 +226,19 @@ cuvsError_t cuvsRMMFree(cuvsResources_t res, void* ptr, size_t bytes);
cuvsError_t cuvsRMMPoolMemoryResourceEnable(int initial_pool_size_percent,
int max_pool_size_percent,
bool managed);
/**
* @brief Switches the working memory resource to use stream-ordered asynchronous allocation
* (cudaMallocAsync / cudaFreeAsync). Unlike the pool resource, this resource returns memory to
* the stream immediately without blocking the CPU, eliminating device-wide synchronization on
* deallocation. This is especially beneficial when multiple CAGRA searches run concurrently on
* separate CUDA streams, because the internal workspace allocations no longer serialize kernel
* launches. Be aware that this function will change the memory resource for the whole process
* and the new memory resource will be used until explicitly changed.
*
* @return cuvsError_t
*/
cuvsError_t cuvsRMMAsyncMemoryResourceEnable();
Comment on lines +229 to +240
⚠️ Potential issue | 🔴 Critical


The async-memory resource owner cannot be thread_local when this API changes the current resource globally.

The implementation at c/src/core/c_api.cpp:188 stores cuda_async_memory_resource in thread_local async_mr and passes it to rmm::mr::set_current_device_resource(), but the documentation explicitly states this function "will change the memory resource for the whole process" (line 235). This creates a critical lifetime mismatch:

  • If set_current_device_resource() is device-scoped (affecting all threads), then when the enabling thread exits, its thread_local async_mr is destroyed while still registered as the current resource, leaving RMM with a dangling pointer.
  • If thread-local semantics were intended, the documentation must be updated to reflect that this only affects the calling thread.

The pool resource avoids this issue by passing temporary rvalues to set_current_device_resource(), allowing RMM to manage the lifetime. Either make async_mr process/device-scoped (not thread_local), or narrow the documentation and implementation to clarify thread-local semantics.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@c/include/cuvs/core/c_api.h` around lines 229 - 240, The current
implementation of cuvsRMMAsyncMemoryResourceEnable stores
cuda_async_memory_resource in a thread_local async_mr and passes it to
rmm::mr::set_current_device_resource(), creating a lifetime mismatch vs the API
doc that says the change is global; change the implementation so the async
memory resource is process-scoped (not thread_local) by replacing thread_local
async_mr with a static/process-global instance (e.g., a static unique_ptr or
static object) so it outlives the thread and remains valid for
rmm::mr::set_current_device_resource(), and ensure
cuvsRMMAsyncMemoryResourceEnable and any cleanup use that same process-global
symbol (cuda_async_memory_resource / async_mr) when setting or resetting the
current device resource; alternatively, if thread-local semantics are intended,
update the cuvsRMMAsyncMemoryResourceEnable documentation to state it only
affects the calling thread and keep async_mr thread_local.


/**
* @brief Resets the memory resource to use the default memory resource (cuda_memory_resource)
* @return cuvsError_t
28 changes: 28 additions & 0 deletions c/include/cuvs/neighbors/cagra.h
@@ -712,6 +712,34 @@ cuvsError_t cuvsCagraSearch(cuvsResources_t res,
DLManagedTensor* distances,
cuvsFilter filter);

/**
* @brief Search multiple CAGRA index segments concurrently using a single GPU kernel launch.
*
* Launches a single kernel with grid (1, num_queries, num_segments) so each CTA handles one
* (query, segment) pair concurrently. All results land in the caller-supplied device buffers
* on the same CUDA stream, so downstream operations (e.g. selectK) see them via stream ordering
* with no explicit synchronization needed.
*
* Only float32 datasets are currently supported. Distance values are comparable across segments
* (same scale) but are not postprocessed (no kScale correction) — they are suitable for
* relative comparison (selectK / recall).
*
* @param[in] res cuvsResources_t opaque C handle
* @param[in] params search parameters
* @param[in] num_segments number of index segments
* @param[in] indices array of num_segments cuvsCagraIndex_t pointers
* @param[in] queries array of num_segments DLManagedTensor* (device, float32, [nq, dim])
* @param[out] neighbors array of num_segments DLManagedTensor* (device, uint32, [nq, topk])
* @param[out] distances array of num_segments DLManagedTensor* (device, float32, [nq, topk])
*/
cuvsError_t cuvsCagraSearchMultiSegment(cuvsResources_t res,
cuvsCagraSearchParams_t params,
uint32_t num_segments,
cuvsCagraIndex_t* indices,
DLManagedTensor** queries,
DLManagedTensor** neighbors,
DLManagedTensor** distances);

/**
* @}
*/
37 changes: 37 additions & 0 deletions c/include/cuvs/selection/select_k.h
@@ -0,0 +1,37 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/
#pragma once

#include <cuvs/core/c_api.h>
#include <dlpack/dlpack.h>

#ifdef __cplusplus
extern "C" {
#endif

/**
* @brief Select the k smallest values from a flat device array of n candidates.
*
* Treats `in_val` as a matrix of shape [1, n] and selects the `k` smallest
* float values. `out_idx` receives the int64 column positions of the selected
* values in [0, n), so the caller can recover per-segment identity as:
*
* segment_index = out_idx[j] / segment_k
* position_in_segment = out_idx[j] % segment_k
*
* @param[in] res cuvsResources_t handle
* @param[in] in_val DLManagedTensor* shape [1, n], float32, device memory
* @param[out] out_val DLManagedTensor* shape [1, k], float32, device memory
* @param[out] out_idx DLManagedTensor* shape [1, k], int64, device memory
* @return cuvsError_t
*/
cuvsError_t cuvsSelectK(cuvsResources_t res,
DLManagedTensor* in_val,
DLManagedTensor* out_val,
DLManagedTensor* out_idx);

#ifdef __cplusplus
}
#endif
38 changes: 33 additions & 5 deletions c/src/core/c_api.cpp
@@ -9,11 +9,13 @@
#include <raft/core/device_resources_snmg.hpp>
#include <raft/core/resource/cuda_stream.hpp>
#include <raft/core/resource/device_id.hpp>
#include <raft/core/resource/device_memory_resource.hpp>
#include <raft/core/resource/resource_types.hpp>
#include <raft/core/resources.hpp>
#include <raft/util/cudart_utils.hpp>
#include <rapids_logger/logger.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/cuda_async_memory_resource.hpp>
#include <rmm/mr/cuda_memory_resource.hpp>
#include <rmm/mr/managed_memory_resource.hpp>
#include <rmm/mr/per_device_resource.hpp>
@@ -35,6 +37,19 @@ extern "C" cuvsError_t cuvsResourcesCreate(cuvsResources_t* res)
});
}

extern "C" cuvsError_t cuvsResourcesSetWorkspacePool(cuvsResources_t res, size_t initial_size_bytes)
{
return cuvs::core::translate_exceptions([=] {
auto res_ptr = reinterpret_cast<raft::resources*>(res);
// Create an uncapped pool: pre-warms with initial_size_bytes to avoid cudaMalloc on every
// query, but can grow beyond that if an allocation exceeds the initial reservation.
raft::resource::set_workspace_resource(
*res_ptr,
rmm::mr::pool_memory_resource{rmm::mr::get_current_device_resource_ref(),
initial_size_bytes});
});
}

extern "C" cuvsError_t cuvsResourcesDestroy(cuvsResources_t res)
{
return cuvs::core::translate_exceptions([=] {
@@ -132,20 +147,22 @@ extern "C" cuvsError_t cuvsRMMAlloc(cuvsResources_t res, void** ptr, size_t byte
{
return cuvs::core::translate_exceptions([=] {
auto res_ptr = reinterpret_cast<raft::resources*>(res);
auto mr = rmm::mr::get_current_device_resource_ref();
*ptr = mr.allocate(raft::resource::get_cuda_stream(*res_ptr), bytes);
auto stream = raft::resource::get_cuda_stream(*res_ptr);
*ptr = raft::resource::get_workspace_resource_ref(*res_ptr).allocate(stream, bytes);
});
}

extern "C" cuvsError_t cuvsRMMFree(cuvsResources_t res, void* ptr, size_t bytes)
{
return cuvs::core::translate_exceptions([=] {
auto res_ptr = reinterpret_cast<raft::resources*>(res);
auto mr = rmm::mr::get_current_device_resource_ref();
mr.deallocate(raft::resource::get_cuda_stream(*res_ptr), ptr, bytes);
auto stream = raft::resource::get_cuda_stream(*res_ptr);
raft::resource::get_workspace_resource_ref(*res_ptr).deallocate(stream, ptr, bytes);
});
}

thread_local std::shared_ptr<rmm::mr::cuda_async_memory_resource> async_mr;

extern "C" cuvsError_t cuvsRMMPoolMemoryResourceEnable(int initial_pool_size_percent,
int max_pool_size_percent,
bool managed)
@@ -164,9 +181,20 @@ extern "C" cuvsError_t cuvsRMMPoolMemoryResourceEnable(int initial_pool_size_per
});
}

extern "C" cuvsError_t cuvsRMMAsyncMemoryResourceEnable()
{
return cuvs::core::translate_exceptions([=] {
async_mr = std::make_shared<rmm::mr::cuda_async_memory_resource>();
rmm::mr::set_current_device_resource(*async_mr);
});
}

extern "C" cuvsError_t cuvsRMMMemoryResourceReset()
{
return cuvs::core::translate_exceptions([=] { rmm::mr::reset_current_device_resource(); });
return cuvs::core::translate_exceptions([=] {
rmm::mr::reset_current_device_resource();
async_mr.reset();
});
}

thread_local std::unique_ptr<rmm::mr::pinned_host_memory_resource> pinned_mr;
48 changes: 48 additions & 0 deletions c/src/neighbors/cagra.cpp
@@ -689,6 +689,54 @@ extern "C" cuvsError_t cuvsCagraSearch(cuvsResources_t res,
});
}

extern "C" cuvsError_t cuvsCagraSearchMultiSegment(cuvsResources_t res,
cuvsCagraSearchParams_t params,
uint32_t num_segments,
cuvsCagraIndex_t* indices,
DLManagedTensor** queries,
DLManagedTensor** neighbors,
DLManagedTensor** distances)
{
return cuvs::core::translate_exceptions([=] {
RAFT_EXPECTS(num_segments > 0, "num_segments must be > 0");
RAFT_EXPECTS(indices != nullptr && queries != nullptr && neighbors != nullptr &&
distances != nullptr,
"All pointer arrays must be non-null");

auto res_ptr = reinterpret_cast<raft::resources*>(res);
auto search_params = cuvs::neighbors::cagra::search_params();
convert_c_search_params(*params, &search_params);

// Only float32 is supported for multi-segment search.
RAFT_EXPECTS(
indices[0]->dtype.code == kDLFloat && indices[0]->dtype.bits == 32,
"Multi-segment search only supports float32 indices");

using T = float;
using IdxT = uint32_t;
using OutIdxT = uint32_t;
using DistanceT = float;
using IndexT = cuvs::neighbors::cagra::index<T, IdxT>;

std::vector<const IndexT*> idx_vec(num_segments);
std::vector<raft::device_matrix_view<const T, int64_t, raft::row_major>> q_vec(num_segments);
std::vector<raft::device_matrix_view<OutIdxT, int64_t, raft::row_major>> n_vec(num_segments);
std::vector<raft::device_matrix_view<DistanceT, int64_t, raft::row_major>> d_vec(num_segments);

for (uint32_t i = 0; i < num_segments; i++) {
RAFT_EXPECTS(indices[i] != nullptr && indices[i]->addr != 0,
"Index at position %u is null or not built", i);
idx_vec[i] = reinterpret_cast<const IndexT*>(indices[i]->addr);
q_vec[i] = cuvs::core::from_dlpack<std::remove_reference_t<decltype(q_vec[i])>>(queries[i]);
n_vec[i] = cuvs::core::from_dlpack<std::remove_reference_t<decltype(n_vec[i])>>(neighbors[i]);
d_vec[i] = cuvs::core::from_dlpack<std::remove_reference_t<decltype(d_vec[i])>>(distances[i]);
}
Comment on lines +710 to +733
⚠️ Potential issue | 🔴 Critical

Validate every segment before the reinterpret_cast.

Only indices[0] is checked for float32. The loop then blindly casts every indices[i]->addr to index<float, uint32_t>* and builds float32 device views from queries[i], neighbors[i], and distances[i]. If any later segment is half/int8/uint8 or any tensor is not device-backed, this path will feed the kernel mismatched types and can produce garbage results or GPU faults. Please mirror the per-input dtype/device/null checks from cuvsCagraSearch for each segment before from_dlpack.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@c/src/neighbors/cagra.cpp` around lines 710 - 733, The loop currently casts
every indices[i]->addr and builds device views without validating each segment's
types/devices; update the loop that fills idx_vec, q_vec, n_vec, d_vec to
perform the same per-segment checks used in cuvsCagraSearch: assert indices[i]
!= nullptr and addr != 0 (already present), then verify indices[i]->dtype.code
== kDLFloat && indices[i]->dtype.bits == 32 for every i, and validate that
queries[i], neighbors[i], and distances[i] are device-backed DLPack tensors with
the expected element types (float for queries/distances, uint32 for neighbors)
before calling reinterpret_cast<const IndexT*>(indices[i]->addr) and
cuvs::core::from_dlpack to populate q_vec[i], n_vec[i], d_vec[i]; replace the
blind casts with RAFT_EXPECTS that include i in the error messages so a bad
segment fails fast and clearly.


cuvs::neighbors::cagra::search_multi_segment(
*res_ptr, search_params, idx_vec, q_vec, n_vec, d_vec);
Comment on lines +726 to +736
⚠️ Potential issue | 🟠 Major

Reject mixed-distance metrics across segments.

This API combines raw distances from all segments into one global ranking, so all indices must use the same metric. Right now nothing checks that indices[i] matches indices[0] on metric, which means mixing L2/IP/Cosine segments will return nonsensical top-k ordering even though the header says the distances are comparable across segments.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@c/src/neighbors/cagra.cpp` around lines 726 - 736, The loop that builds
idx_vec/q_vec/n_vec/d_vec must also validate that all segment indices use the
same distance metric as the first segment to prevent mixing incompatible
metrics; inside the for-loop in cagra.cpp (after the existing RAFT_EXPECTS that
checks indices[i] non-null) compare indices[i]->metric (or the actual metric
field name on your IndexT struct/class) to indices[0]->metric and fail fast with
RAFT_EXPECTS (or equivalent) and a clear message like "Mixed distance metrics
across segments: expected %s but got %s at segment %u"; keep this check before
pushing idx_vec[i] and before calling
cuvs::neighbors::cagra::search_multi_segment so the function only runs when all
segments share the same metric.

});
}

extern "C" cuvsError_t cuvsCagraMerge(cuvsResources_t res,
cuvsCagraIndexParams_t params,
cuvsCagraIndex_t* indices,
42 changes: 42 additions & 0 deletions c/src/selection/select_k.cpp
@@ -0,0 +1,42 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

#include <cuvs/core/c_api.h>
#include "../core/exceptions.hpp"
#include <cuvs/selection/select_k.hpp>
#include <dlpack/dlpack.h>

#include <raft/core/device_mdspan.hpp>
#include <raft/core/resources.hpp>

extern "C" cuvsError_t cuvsSelectK(cuvsResources_t res,
DLManagedTensor* in_val,
DLManagedTensor* out_val,
DLManagedTensor* out_idx)
{
return cuvs::core::translate_exceptions([=] {
auto* res_ptr = reinterpret_cast<raft::resources*>(res);

int64_t n = in_val->dl_tensor.shape[1];
int64_t k = out_val->dl_tensor.shape[1];

auto in_view = raft::make_device_matrix_view<const float, int64_t, raft::row_major>(
static_cast<const float*>(in_val->dl_tensor.data), 1, n);

auto out_val_view = raft::make_device_matrix_view<float, int64_t, raft::row_major>(
static_cast<float*>(out_val->dl_tensor.data), 1, k);

auto out_idx_view = raft::make_device_matrix_view<int64_t, int64_t, raft::row_major>(
static_cast<int64_t*>(out_idx->dl_tensor.data), 1, k);

cuvs::selection::select_k(
*res_ptr,
in_view,
std::nullopt, // implicit positions [0, n) as in_idx
out_val_view,
out_idx_view,
true); // select_min = true (smallest distance = nearest neighbor)
Comment on lines +14 to +40
⚠️ Potential issue | 🔴 Critical

Validate the DLPack contract before dereferencing shape[1] and casting buffers.

cuvsSelectK currently assumes non-null 2D CUDA tensors with float32 / float32 / int64 dtypes, zero offset, and contiguous row-major layout. Without checking ndim, shape, device, dtype, byte_offset, and stride compatibility, malformed callers can crash here or feed corrupted views into select_k.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@c/src/selection/select_k.cpp` around lines 14 - 40, cuvsSelectK dereferences
shape[1] and casts buffers without validating the DLPack tensors; add explicit
validation at the top of cuvsSelectK for in_val, out_val, out_idx (non-null),
then check each DLManagedTensor->dl_tensor for expected ndim (==2), shapes (rows
match expected 1 or compatible), dtype (in_val/out_val float32, out_idx int64),
device type (CUDA) and device id, byte_offset == 0, and contiguous row-major
strides/compatibility before creating device views with
raft::make_device_matrix_view; if any check fails return an appropriate
cuvsError_t (or throw inside translate_exceptions) instead of proceeding to
casts and calling cuvs::selection::select_k so malformed callers cannot crash or
corrupt memory.

});
}