diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt index 2439f9269f..df0c81923f 100644 --- a/c/CMakeLists.txt +++ b/c/CMakeLists.txt @@ -104,6 +104,7 @@ add_library( src/preprocessing/quantize/pq.cpp src/preprocessing/quantize/scalar.cpp src/distance/pairwise_distance.cpp + src/selection/select_k.cpp ) add_library(cuvs::c_api ALIAS cuvs_c) set_target_properties( diff --git a/c/include/cuvs/core/c_api.h b/c/include/cuvs/core/c_api.h index b9941d6ae4..622fe4cdcf 100644 --- a/c/include/cuvs/core/c_api.h +++ b/c/include/cuvs/core/c_api.h @@ -129,6 +129,22 @@ cuvsError_t cuvsStreamSync(cuvsResources_t res); */ cuvsError_t cuvsDeviceIdGet(cuvsResources_t res, int* device_id); +/** + * @brief Configure the temporary workspace on this resources object as an uncapped pool, backed + * by the current device memory resource. After the initial reservation is allocated on + * first use, subsequent calls to cuvsRMMAlloc / cuvsRMMFree on the same resources handle + * hit the pool cache rather than calling cudaMallocAsync / cudaFreeAsync, reducing CUDA + * context lock contention under concurrent query threads. The pool grows without shrinking: + * freed allocations are returned to the pool rather than to the device, so the pool's + * high-water mark only increases until the resources object is destroyed. 
+ * + * @param[in] res cuvsResources_t opaque C handle + * @param[in] initial_size_bytes initial pool reservation in bytes; size to cover the + * steady-state working set to avoid growth after warmup + * @return cuvsError_t + */ +cuvsError_t cuvsResourcesSetWorkspacePool(cuvsResources_t res, size_t initial_size_bytes); + /** * @brief Create an Initialized opaque C handle for C++ type `raft::device_resources_snmg` * for multi-GPU operations @@ -210,6 +226,19 @@ cuvsError_t cuvsRMMFree(cuvsResources_t res, void* ptr, size_t bytes); cuvsError_t cuvsRMMPoolMemoryResourceEnable(int initial_pool_size_percent, int max_pool_size_percent, bool managed); +/** + * @brief Switches the working memory resource to use stream-ordered asynchronous allocation + * (cudaMallocAsync / cudaFreeAsync). Unlike the pool resource, this resource returns memory to + * the stream immediately without blocking the CPU, eliminating device-wide synchronization on + * deallocation. This is especially beneficial when multiple CAGRA searches run concurrently on + * separate CUDA streams, because the internal workspace allocations no longer serialize kernel + * launches. Be aware that this function will change the memory resource for the whole process + * and the new memory resource will be used until explicitly changed. + * + * @return cuvsError_t + */ +cuvsError_t cuvsRMMAsyncMemoryResourceEnable(); + /** * @brief Resets the memory resource to use the default memory resource (cuda_memory_resource) * @return cuvsError_t diff --git a/c/include/cuvs/neighbors/cagra.h b/c/include/cuvs/neighbors/cagra.h index 3304e10f37..f6fc7da8e5 100644 --- a/c/include/cuvs/neighbors/cagra.h +++ b/c/include/cuvs/neighbors/cagra.h @@ -712,6 +712,34 @@ cuvsError_t cuvsCagraSearch(cuvsResources_t res, DLManagedTensor* distances, cuvsFilter filter); +/** + * @brief Search multiple CAGRA index segments concurrently using a single GPU kernel launch. 
+ * + * Launches a single kernel with grid (1, num_queries, num_segments) so each CTA handles one + * (query, segment) pair concurrently. All results land in the caller-supplied device buffers + * on the same CUDA stream, so downstream operations (e.g. selectK) see them via stream ordering + * with no explicit synchronization needed. + * + * Only float32 datasets are currently supported. Distance values are comparable across segments + * (same scale) but are not postprocessed (no kScale correction) — they are suitable for + * relative comparison (selectK / recall). + * + * @param[in] res cuvsResources_t opaque C handle + * @param[in] params search parameters + * @param[in] num_segments number of index segments + * @param[in] indices array of num_segments cuvsCagraIndex_t pointers + * @param[in] queries array of num_segments DLManagedTensor* (device, float32, [nq, dim]) + * @param[out] neighbors array of num_segments DLManagedTensor* (device, uint32, [nq, topk]) + * @param[out] distances array of num_segments DLManagedTensor* (device, float32, [nq, topk]) + */ +cuvsError_t cuvsCagraSearchMultiSegment(cuvsResources_t res, + cuvsCagraSearchParams_t params, + uint32_t num_segments, + cuvsCagraIndex_t* indices, + DLManagedTensor** queries, + DLManagedTensor** neighbors, + DLManagedTensor** distances); + /** * @} */ diff --git a/c/include/cuvs/selection/select_k.h b/c/include/cuvs/selection/select_k.h new file mode 100644 index 0000000000..ad79b9e3a7 --- /dev/null +++ b/c/include/cuvs/selection/select_k.h @@ -0,0 +1,37 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Select the k smallest values from a flat device array of n candidates. + * + * Treats `in_val` as a matrix of shape [1, n] and selects the `k` smallest + * float values. 
`out_idx` receives the int64 column positions of the selected + * values in [0, n), so the caller can recover per-segment identity as: + * + * segment_index = out_idx[j] / segment_k + * position_in_segment = out_idx[j] % segment_k + * + * @param[in] res cuvsResources_t handle + * @param[in] in_val DLManagedTensor* shape [1, n], float32, device memory + * @param[out] out_val DLManagedTensor* shape [1, k], float32, device memory + * @param[out] out_idx DLManagedTensor* shape [1, k], int64, device memory + * @return cuvsError_t + */ +cuvsError_t cuvsSelectK(cuvsResources_t res, + DLManagedTensor* in_val, + DLManagedTensor* out_val, + DLManagedTensor* out_idx); + +#ifdef __cplusplus +} +#endif diff --git a/c/src/core/c_api.cpp b/c/src/core/c_api.cpp index f4e3664482..05e3856da1 100644 --- a/c/src/core/c_api.cpp +++ b/c/src/core/c_api.cpp @@ -9,11 +9,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -35,6 +37,19 @@ extern "C" cuvsError_t cuvsResourcesCreate(cuvsResources_t* res) }); } +extern "C" cuvsError_t cuvsResourcesSetWorkspacePool(cuvsResources_t res, size_t initial_size_bytes) +{ + return cuvs::core::translate_exceptions([=] { + auto res_ptr = reinterpret_cast(res); + // Create an uncapped pool: pre-warms with initial_size_bytes to avoid cudaMalloc on every + // query, but can grow beyond that if an allocation exceeds the initial reservation. 
+ raft::resource::set_workspace_resource( + *res_ptr, + rmm::mr::pool_memory_resource{rmm::mr::get_current_device_resource_ref(), + initial_size_bytes}); + }); +} + extern "C" cuvsError_t cuvsResourcesDestroy(cuvsResources_t res) { return cuvs::core::translate_exceptions([=] { @@ -132,8 +147,8 @@ extern "C" cuvsError_t cuvsRMMAlloc(cuvsResources_t res, void** ptr, size_t byte { return cuvs::core::translate_exceptions([=] { auto res_ptr = reinterpret_cast(res); - auto mr = rmm::mr::get_current_device_resource_ref(); - *ptr = mr.allocate(raft::resource::get_cuda_stream(*res_ptr), bytes); + auto stream = raft::resource::get_cuda_stream(*res_ptr); + *ptr = raft::resource::get_workspace_resource_ref(*res_ptr).allocate(stream, bytes); }); } @@ -141,11 +156,13 @@ extern "C" cuvsError_t cuvsRMMFree(cuvsResources_t res, void* ptr, size_t bytes) { return cuvs::core::translate_exceptions([=] { auto res_ptr = reinterpret_cast(res); - auto mr = rmm::mr::get_current_device_resource_ref(); - mr.deallocate(raft::resource::get_cuda_stream(*res_ptr), ptr, bytes); + auto stream = raft::resource::get_cuda_stream(*res_ptr); + raft::resource::get_workspace_resource_ref(*res_ptr).deallocate(stream, ptr, bytes); }); } +thread_local std::shared_ptr async_mr; + extern "C" cuvsError_t cuvsRMMPoolMemoryResourceEnable(int initial_pool_size_percent, int max_pool_size_percent, bool managed) @@ -164,9 +181,20 @@ extern "C" cuvsError_t cuvsRMMPoolMemoryResourceEnable(int initial_pool_size_per }); } +extern "C" cuvsError_t cuvsRMMAsyncMemoryResourceEnable() +{ + return cuvs::core::translate_exceptions([=] { + async_mr = std::make_shared(); + rmm::mr::set_current_device_resource(*async_mr); + }); +} + extern "C" cuvsError_t cuvsRMMMemoryResourceReset() { - return cuvs::core::translate_exceptions([=] { rmm::mr::reset_current_device_resource(); }); + return cuvs::core::translate_exceptions([=] { + rmm::mr::reset_current_device_resource(); + async_mr.reset(); + }); } thread_local std::unique_ptr 
pinned_mr; diff --git a/c/src/neighbors/cagra.cpp b/c/src/neighbors/cagra.cpp index 081179ca46..2a4edda2a8 100644 --- a/c/src/neighbors/cagra.cpp +++ b/c/src/neighbors/cagra.cpp @@ -689,6 +689,54 @@ extern "C" cuvsError_t cuvsCagraSearch(cuvsResources_t res, }); } +extern "C" cuvsError_t cuvsCagraSearchMultiSegment(cuvsResources_t res, + cuvsCagraSearchParams_t params, + uint32_t num_segments, + cuvsCagraIndex_t* indices, + DLManagedTensor** queries, + DLManagedTensor** neighbors, + DLManagedTensor** distances) +{ + return cuvs::core::translate_exceptions([=] { + RAFT_EXPECTS(num_segments > 0, "num_segments must be > 0"); + RAFT_EXPECTS(indices != nullptr && queries != nullptr && neighbors != nullptr && + distances != nullptr, + "All pointer arrays must be non-null"); + + auto res_ptr = reinterpret_cast(res); + auto search_params = cuvs::neighbors::cagra::search_params(); + convert_c_search_params(*params, &search_params); + + // Only float32 is supported for multi-segment search. + RAFT_EXPECTS( + indices[0]->dtype.code == kDLFloat && indices[0]->dtype.bits == 32, + "Multi-segment search only supports float32 indices"); + + using T = float; + using IdxT = uint32_t; + using OutIdxT = uint32_t; + using DistanceT = float; + using IndexT = cuvs::neighbors::cagra::index; + + std::vector idx_vec(num_segments); + std::vector> q_vec(num_segments); + std::vector> n_vec(num_segments); + std::vector> d_vec(num_segments); + + for (uint32_t i = 0; i < num_segments; i++) { + RAFT_EXPECTS(indices[i] != nullptr && indices[i]->addr != 0, + "Index at position %u is null or not built", i); + idx_vec[i] = reinterpret_cast(indices[i]->addr); + q_vec[i] = cuvs::core::from_dlpack>(queries[i]); + n_vec[i] = cuvs::core::from_dlpack>(neighbors[i]); + d_vec[i] = cuvs::core::from_dlpack>(distances[i]); + } + + cuvs::neighbors::cagra::search_multi_segment( + *res_ptr, search_params, idx_vec, q_vec, n_vec, d_vec); + }); +} + extern "C" cuvsError_t cuvsCagraMerge(cuvsResources_t res, 
cuvsCagraIndexParams_t params, cuvsCagraIndex_t* indices, diff --git a/c/src/selection/select_k.cpp b/c/src/selection/select_k.cpp new file mode 100644 index 0000000000..f68416454a --- /dev/null +++ b/c/src/selection/select_k.cpp @@ -0,0 +1,42 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include "../core/exceptions.hpp" +#include +#include + +#include +#include + +extern "C" cuvsError_t cuvsSelectK(cuvsResources_t res, + DLManagedTensor* in_val, + DLManagedTensor* out_val, + DLManagedTensor* out_idx) +{ + return cuvs::core::translate_exceptions([=] { + auto* res_ptr = reinterpret_cast(res); + + int64_t n = in_val->dl_tensor.shape[1]; + int64_t k = out_val->dl_tensor.shape[1]; + + auto in_view = raft::make_device_matrix_view( + static_cast(in_val->dl_tensor.data), 1, n); + + auto out_val_view = raft::make_device_matrix_view( + static_cast(out_val->dl_tensor.data), 1, k); + + auto out_idx_view = raft::make_device_matrix_view( + static_cast(out_idx->dl_tensor.data), 1, k); + + cuvs::selection::select_k( + *res_ptr, + in_view, + std::nullopt, // implicit positions [0, n) as in_idx + out_val_view, + out_idx_view, + true); // select_min = true (smallest distance = nearest neighbor) + }); +} diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp index a7e1249677..fd9b551a36 100644 --- a/cpp/include/cuvs/neighbors/cagra.hpp +++ b/cpp/include/cuvs/neighbors/cagra.hpp @@ -29,6 +29,7 @@ #include #include #include +#include namespace cuvs::neighbors::graph_build_params { using iterative_search_params = cuvs::neighbors::search_params; @@ -1723,6 +1724,87 @@ void search(raft::resources const& res, const cuvs::neighbors::filtering::base_filter& sample_filter = cuvs::neighbors::filtering::none_sample_filter{}); +/** + * @brief Search multiple CAGRA index segments concurrently in a single kernel launch. 
+ * + * Launches a single SINGLE_CTA kernel grid that covers all segments, with one CTA per segment. + * All per-segment results are written into the caller-supplied device buffers on the stream + * associated with @p res; the call returns when all segments have been submitted to the stream + * (not necessarily completed). Use @c cuvsStreamSync to wait for completion. + * + * Distance values are comparable across segments but are not postprocessed (no kScale correction). + * + * @param[in] res raft resources + * @param[in] params search parameters + * @param[in] indices one index per segment + * @param[in] queries per-segment query matrix [n_queries, dim] + * @param[out] neighbors per-segment result neighbors [n_queries, topk] + * @param[out] distances per-segment result distances [n_queries, topk] + */ +void search_multi_segment( + raft::resources const& res, + cuvs::neighbors::cagra::search_params const& params, + const std::vector*>& indices, + const std::vector>& queries, + const std::vector>& neighbors, + const std::vector>& distances); + +void search_multi_segment( + raft::resources const& res, + cuvs::neighbors::cagra::search_params const& params, + const std::vector*>& indices, + const std::vector>& queries, + const std::vector>& neighbors, + const std::vector>& distances); + +void search_multi_segment( + raft::resources const& res, + cuvs::neighbors::cagra::search_params const& params, + const std::vector*>& indices, + const std::vector>& queries, + const std::vector>& neighbors, + const std::vector>& distances); + +void search_multi_segment( + raft::resources const& res, + cuvs::neighbors::cagra::search_params const& params, + const std::vector*>& indices, + const std::vector>& queries, + const std::vector>& neighbors, + const std::vector>& distances); + +void search_multi_segment( + raft::resources const& res, + cuvs::neighbors::cagra::search_params const& params, + const std::vector*>& indices, + const std::vector>& queries, + const std::vector>& 
neighbors, + const std::vector>& distances); + +void search_multi_segment( + raft::resources const& res, + cuvs::neighbors::cagra::search_params const& params, + const std::vector*>& indices, + const std::vector>& queries, + const std::vector>& neighbors, + const std::vector>& distances); + +void search_multi_segment( + raft::resources const& res, + cuvs::neighbors::cagra::search_params const& params, + const std::vector*>& indices, + const std::vector>& queries, + const std::vector>& neighbors, + const std::vector>& distances); + +void search_multi_segment( + raft::resources const& res, + cuvs::neighbors::cagra::search_params const& params, + const std::vector*>& indices, + const std::vector>& queries, + const std::vector>& neighbors, + const std::vector>& distances); + /** * @} */ diff --git a/cpp/src/neighbors/cagra.cuh b/cpp/src/neighbors/cagra.cuh index 73c3794d39..8df172cff0 100644 --- a/cpp/src/neighbors/cagra.cuh +++ b/cpp/src/neighbors/cagra.cuh @@ -406,6 +406,19 @@ index merge(raft::resources const& handle, return cagra::detail::merge(handle, params, indices, row_filter); } +template +void search_multi_segment( + raft::resources const& res, + search_params const& params, + const std::vector*>& indices, + const std::vector>& queries, + const std::vector>& neighbors, + const std::vector>& distances) +{ + cagra::detail::search_multi_segment( + res, params, indices, queries, neighbors, distances); +} + /** @} */ // end group cagra } // namespace cuvs::neighbors::cagra diff --git a/cpp/src/neighbors/cagra_search_inst.cu.in b/cpp/src/neighbors/cagra_search_inst.cu.in index dfef630798..be5c4a906d 100644 --- a/cpp/src/neighbors/cagra_search_inst.cu.in +++ b/cpp/src/neighbors/cagra_search_inst.cu.in @@ -32,4 +32,22 @@ CUVS_INST_CAGRA_SEARCH(data_t, uint32_t, int64_t); #undef CUVS_INST_CAGRA_SEARCH +#define CUVS_INST_CAGRA_SEARCH_MULTI_SEGMENT(T, IdxT, OutputIdxT) \ + void search_multi_segment( \ + raft::resources const& handle, \ + 
cuvs::neighbors::cagra::search_params const& params, \ + const std::vector*>& indices, \ + const std::vector>& queries, \ + const std::vector>& neighbors, \ + const std::vector>& distances) \ + { \ + cuvs::neighbors::cagra::search_multi_segment( \ + handle, params, indices, queries, neighbors, distances); \ + } + +CUVS_INST_CAGRA_SEARCH_MULTI_SEGMENT(data_t, uint32_t, uint32_t); +CUVS_INST_CAGRA_SEARCH_MULTI_SEGMENT(data_t, uint32_t, int64_t); + +#undef CUVS_INST_CAGRA_SEARCH_MULTI_SEGMENT + } // namespace cuvs::neighbors::cagra diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh index f1650980e0..4b3480ae50 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh @@ -9,6 +9,7 @@ #include "factory.cuh" #include "sample_filter_utils.cuh" #include "search_plan.cuh" +#include "search_single_cta.cuh" #include #include @@ -251,4 +252,169 @@ void search_main(raft::resources const& res, } /** @} */ // end group cagra +/** + * @brief Search all segments concurrently for a single query using one kernel launch. + * + * Each segment's CTA runs independently (blockIdx.z = segment_id, blockIdx.y = query_id). + * All segments must use float32 data with the same search parameters. 
+ * + * @param indices per-segment indices (strided datasets only for now) + * @param queries per-segment query views — same vector repeated for each segment + * @param neighbors per-segment output neighbor views — each [num_queries, topk] + * @param distances per-segment output distance views — each [num_queries, topk] + */ +template +void search_multi_segment( + raft::resources const& res, + search_params params, + const std::vector*>& indices, + const std::vector>& queries, + const std::vector>& neighbors, + const std::vector>& distances) +{ + static_assert(std::is_same_v, "Only uint32_t graph index type is supported"); + static_assert(std::is_same_v, "Only float distances are supported"); + + const uint32_t num_segments = static_cast(indices.size()); + RAFT_EXPECTS(num_segments > 0, "At least one segment is required"); + RAFT_EXPECTS(queries.size() == num_segments && neighbors.size() == num_segments && + distances.size() == num_segments, + "All input vectors must have the same size"); + + const int64_t dim = queries[0].extent(1); + const uint32_t topk = static_cast(neighbors[0].extent(1)); + const uint32_t n_queries = static_cast(queries[0].extent(0)); + + // Find the max graph_degree across all segments (needed for the shared kernel plan). + int64_t max_graph_degree = 0; + int64_t max_dataset_size = 0; + for (uint32_t i = 0; i < num_segments; i++) { + RAFT_EXPECTS(!indices[i]->dataset_fd().has_value(), + "Disk-based datasets are not supported for multi-segment search"); + max_graph_degree = std::max(max_graph_degree, indices[i]->graph().extent(1)); + max_dataset_size = std::max(max_dataset_size, indices[i]->data().n_rows()); + } + + if (params.max_queries == 0) { + cudaDeviceProp deviceProp = raft::resource::get_device_properties(res); + params.max_queries = + std::min(static_cast(n_queries), deviceProp.maxGridSize[1]); + } + + // Multi-segment uses a regular (non-persistent) single-CTA kernel launch. 
+ // MULTI_CTA requires a different plan type (multi_cta_search::search) and is not supported here. + // AUTO could resolve to MULTI_CTA for large itopk_size, so force SINGLE_CTA unconditionally. + params.persistent = false; + params.algo = search_algo::SINGLE_CTA; + + // Build a single search plan sized for the maximum graph_degree across all segments. + // For the first segment's descriptor type (strided float): use it to init the plan. + using graph_idx_type = uint32_t; + auto* strided_dset0 = dynamic_cast*>(&indices[0]->data()); + RAFT_EXPECTS(strided_dset0 != nullptr, + "Multi-segment search only supports strided (non-compressed) datasets"); + + RAFT_EXPECTS(indices[0]->metric() != cuvs::distance::DistanceType::CosineExpanded || + indices[0]->dataset_norms().has_value(), + "Dataset norms must be provided for CosineExpanded metric"); + const float* dataset_norms_ptr0 = nullptr; + if (indices[0]->metric() == cuvs::distance::DistanceType::CosineExpanded) { + dataset_norms_ptr0 = indices[0]->dataset_norms().value().data_handle(); + } + // Use the first segment's descriptor to construct the plan (smem layout is type-dependent only). + auto plan_desc = dataset_descriptor_init_with_cache( + res, params, *strided_dset0, indices[0]->metric(), dataset_norms_ptr0); + + single_cta_search::search + plan(res, params, plan_desc, dim, max_dataset_size, max_graph_degree, topk); + + // Build per-segment descriptors and result pointers on the host. + // The device copy is allocated below. + using seg_desc_t = single_cta_search::multi_segment_desc_t; + std::vector host_seg_descs(num_segments); + + cudaStream_t stream = raft::resource::get_cuda_stream(res); + + // Collect per-segment dataset descriptors (may trigger lazy device init on `stream`). 
+ std::vector> seg_dataset_descs; + seg_dataset_descs.reserve(num_segments); + + for (uint32_t i = 0; i < num_segments; i++) { + auto* strided_dset = dynamic_cast*>(&indices[i]->data()); + RAFT_EXPECTS(strided_dset != nullptr, + "All segments must have strided (non-compressed) datasets"); + const float* norms_ptr = nullptr; + if (indices[i]->metric() == cuvs::distance::DistanceType::CosineExpanded) { + RAFT_EXPECTS(indices[i]->dataset_norms().has_value(), + "Dataset norms required for CosineExpanded metric (segment %u)", i); + norms_ptr = indices[i]->dataset_norms().value().data_handle(); + } + seg_dataset_descs.push_back(dataset_descriptor_init_with_cache( + res, params, *strided_dset, indices[i]->metric(), norms_ptr)); + + // Call dev_ptr to trigger lazy device-side descriptor upload (enqueued on stream). + host_seg_descs[i].dataset_desc = seg_dataset_descs.back().dev_ptr(stream); + host_seg_descs[i].graph = indices[i]->graph().data_handle(); + host_seg_descs[i].graph_degree = static_cast(indices[i]->graph().extent(1)); + host_seg_descs[i].queries_ptr = queries[i].data_handle(); + + // Tag the result_indices_ptr with the OutputIdxT size (same convention as select_and_run). + constexpr uintptr_t kTag = raft::Pow2::Log2; + host_seg_descs[i].result_indices_ptr = + reinterpret_cast(neighbors[i].data_handle()) | kTag; + host_seg_descs[i].result_distances_ptr = distances[i].data_handle(); + } + + // Upload segment descriptors via workspace pool (no cudaMallocAsync/cudaFreeAsync after warmup). + lightweight_uvector dev_seg_descs_buf(res); + dev_seg_descs_buf.resize(num_segments, stream); + RAFT_CUDA_TRY(cudaMemcpyAsync(dev_seg_descs_buf.data(), + host_seg_descs.data(), + num_segments * sizeof(seg_desc_t), + cudaMemcpyHostToDevice, + stream)); + + // Launch all-segment kernel; stream ordering ensures descriptor upload and per-segment + // dataset_desc device-init complete before the search kernel executes. 
+ plan.run_multi_segment(res, dev_seg_descs_buf.data(), num_segments, n_queries, topk); + // dev_seg_descs_buf destructor returns memory to workspace pool (stream-ordered). + + // Post-process distances (scale + metric transform) for each segment. + constexpr float kScale = cuvs::spatial::knn::detail::utils::config::kDivisor / + cuvs::spatial::knn::detail::utils::config::kDivisor; + for (uint32_t i = 0; i < num_segments; i++) { + float* dist_out = distances[i].data_handle(); + const DistanceT* dist_in = distances[i].data_handle(); + if (indices[i]->metric() == cuvs::distance::DistanceType::CosineExpanded) { + auto query_norms = raft::make_device_vector(res, n_queries); + auto scaled_sq_op = + raft::compose_op(raft::sq_op{}, + raft::div_const_op{DistanceT(kScale)}, + raft::cast_op()); + raft::linalg::reduce( + res, + raft::make_device_matrix_view( + queries[i].data_handle(), n_queries, dim), + query_norms.view(), + (DistanceT)0, + false, + scaled_sq_op, + raft::add_op(), + raft::sqrt_op{}); + raft::linalg::matrix_vector_op( + res, + raft::make_const_mdspan(distances[i]), + raft::make_const_mdspan(query_norms.view()), + distances[i], + raft::compose_op(raft::add_const_op{DistanceT(1)}, raft::div_checkzero_op{})); + } else { + cuvs::neighbors::ivf::detail::postprocess_distances( + res, dist_out, dist_in, indices[i]->metric(), n_queries, topk, kScale, true); + } + } +} + } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh index 02bf1ff697..0592dc4ed5 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh @@ -87,6 +87,12 @@ struct search uint32_t num_itopk_candidates; + /** Number of elements in a hashmap covering @p n_queries queries across @p n_segments segments. 
*/ + static size_t hashmap_element_count(size_t n_segments, size_t n_queries, size_t h_bitlen) + { + return n_segments * n_queries * hashmap::get_size(h_bitlen); + } + search(raft::resources const& res, search_params params, const dataset_descriptor_host& dataset_desc, @@ -197,12 +203,60 @@ struct search RAFT_LOG_DEBUG("# smem_size: %u", smem_size); hashmap_size = 0; if (small_hash_bitlen == 0 && !this->persistent) { - hashmap_size = max_queries * hashmap::get_size(hash_bitlen); + hashmap_size = hashmap_element_count(1, max_queries, hash_bitlen); hashmap.resize(hashmap_size, raft::resource::get_cuda_stream(res)); } RAFT_LOG_DEBUG("# hashmap_size: %lu", hashmap_size); } + /** + * @brief Search all segments concurrently in a single kernel launch. + * + * @param res RAFT resources (stream is extracted from here) + * @param segment_descs device pointer to [num_segments] descriptors + * @param num_segments number of segments (gridDim.z) + * @param num_queries queries per segment (gridDim.y) + * @param topk neighbors to return per (query, segment) + */ + void run_multi_segment( + raft::resources const& res, + const multi_segment_desc_t* segment_descs, + uint32_t num_segments, + uint32_t num_queries, + uint32_t topk) + { + cudaStream_t stream = raft::resource::get_cuda_stream(res); + + // Allocate global hashmap when small-hash is disabled via the workspace pool + // (no cudaMallocAsync/cudaFreeAsync after pool warmup). + // Layout: [num_segments][num_queries][hash_size]. 
+ lightweight_uvector ms_hashmap_buf(res); + INDEX_T* ms_hashmap_ptr = nullptr; + if (small_hash_bitlen == 0) { + const size_t ms_hashmap_elems = hashmap_element_count(num_segments, num_queries, hash_bitlen); + ms_hashmap_buf.resize(ms_hashmap_elems, stream); + ms_hashmap_ptr = ms_hashmap_buf.data(); + } + + select_and_run_multi_segment( + segment_descs, + num_segments, + num_queries, + *this, + topk, + num_itopk_candidates, + static_cast(thread_block_size), + smem_size, + hash_bitlen, + ms_hashmap_ptr, + small_hash_bitlen, + small_hash_reset_interval, + cuvs::neighbors::filtering::none_sample_filter{}, + stream); + // ms_hashmap_buf destructor returns memory to workspace pool (stream-ordered). + } + void operator()( raft::resources const& res, raft::device_matrix_view graph, diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cu.in b/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cu.in index 85342e7093..c2d30fa6f5 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cu.in +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cu.in @@ -20,5 +20,9 @@ instantiate_kernel_selection(data_t, float, cuvs::neighbors::filtering::none_sample_filter); instantiate_kernel_selection(data_t, uint32_t, float, bitset_filter_t); +instantiate_kernel_selection_ms(data_t, + uint32_t, + float, + cuvs::neighbors::filtering::none_sample_filter); } // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh index 11b468cfca..18314b5c59 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh @@ -34,4 +34,21 @@ namespace cuvs::neighbors::cagra::detail::single_cta_search { SampleFilterT sample_filter, \ cudaStream_t stream); +#define instantiate_kernel_selection_ms(DataT, IndexT, DistanceT, SampleFilterT) \ + template void 
select_and_run_multi_segment( \ + const multi_segment_desc_t* segment_descs, \ + uint32_t num_segments, \ + uint32_t num_queries, \ + const search_params& ps, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + IndexT* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + SampleFilterT sample_filter, \ + cudaStream_t stream); + } // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 48553611bf..a0b66acc4b 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -1165,6 +1165,146 @@ RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel( graph_size); } +/** + * @brief Multi-segment CAGRA search kernel. + * + * Grid: (1, num_queries, num_segments). + * Each CTA handles one (query, segment) pair independently. + * The global hashmap (if used) must be laid out as + * [num_segments][num_queries][hashmap::get_size(hash_bitlen)]. 
+ */ +template +RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel_ms( + const multi_segment_desc_t* segments, + const std::uint32_t top_k, + const SourceIndexT* source_indices_ptr, + const unsigned num_distilation, + const uint64_t rand_xor_mask, + const uint32_t num_seeds, + typename DATASET_DESCRIPTOR_T::INDEX_T* visited_hashmap_ptr, + const std::uint32_t max_candidates, + const std::uint32_t max_itopk, + const std::uint32_t internal_topk, + const std::uint32_t search_width, + const std::uint32_t min_iteration, + const std::uint32_t max_iteration, + std::uint32_t* const num_executed_iterations, + const std::uint32_t hash_bitlen, + const std::uint32_t small_hash_bitlen, + const std::uint32_t small_hash_reset_interval, + SAMPLE_FILTER_T sample_filter) +{ + using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; + + const uint32_t query_id = blockIdx.y; + const uint32_t seg_id = blockIdx.z; + const auto& seg = segments[seg_id]; + + // Offset the global hashmap to the base of this segment's block. + // search_core will then add blockIdx.y * hash_size for the per-query offset, giving the correct + // layout: visited_hashmap_ptr[(seg_id * gridDim.y + query_id) * hash_size]. + INDEX_T* seg_hashmap_ptr = + (visited_hashmap_ptr != nullptr) + ? 
visited_hashmap_ptr + + seg_id * static_cast(gridDim.y) * hashmap::get_size(hash_bitlen) + : nullptr; + + search_core(seg.result_indices_ptr, + seg.result_distances_ptr, + top_k, + seg.dataset_desc, + seg.queries_ptr, + seg.graph, + seg.graph_degree, + source_indices_ptr, + num_distilation, + rand_xor_mask, + nullptr, // seed_ptr: not used in multi-segment + 0, // num_seeds + seg_hashmap_ptr, + max_candidates, + max_itopk, + internal_topk, + search_width, + min_iteration, + max_iteration, + num_executed_iterations, + hash_bitlen, + small_hash_bitlen, + small_hash_reset_interval, + query_id, + sample_filter); +} + +template +auto dispatch_kernel_ms = []() { + static_assert(TOPK_BY_BITONIC_SORT || !BITONIC_SORT_AND_MERGE_MULTI_WARPS); + return search_kernel_ms; +}(); + +/** + * @brief Encodes the (TOPK_BY_BITONIC_SORT, BITONIC_SORT_AND_MERGE_MULTI_WARPS) template + * booleans as a runtime value, selected from search parameters. + */ +enum class TopkVariant { + BITONIC, ///< bitonic sort, no multi-warp merge (num_itopk_candidates ≤ 256, itopk_size ≤ 256) + BITONIC_MERGE_MULTI, ///< bitonic sort, multi-warp merge (num_itopk_candidates ≤ 256, itopk_size > 256) + RADIX, ///< radix-based topk (num_itopk_candidates > 256) +}; + +/** + * @brief Selects the topk algorithm variant from runtime search parameters. 
+ */ +inline TopkVariant select_topk_variant(unsigned itopk_size, + unsigned num_itopk_candidates, + unsigned block_size) +{ + assert(itopk_size <= 512); + if (num_itopk_candidates <= 256) { + if (itopk_size <= 256) { return TopkVariant::BITONIC; } + assert(block_size >= 64); + return TopkVariant::BITONIC_MERGE_MULTI; + } + return TopkVariant::RADIX; +} + +template +struct search_kernel_config_ms { + using kernel_t = decltype(dispatch_kernel_ms); + + static auto choose_itopk_and_mx_candidates(unsigned itopk_size, + unsigned num_itopk_candidates, + unsigned block_size) -> kernel_t + { + switch (select_topk_variant(itopk_size, num_itopk_candidates, block_size)) { + case TopkVariant::BITONIC: + return dispatch_kernel_ms; + case TopkVariant::BITONIC_MERGE_MULTI: + return dispatch_kernel_ms; + default: + return dispatch_kernel_ms; + } + } +}; + // To make sure we avoid false sharing on both CPU and GPU, we enforce cache line size to the // maximum of the two. // This makes sync atomic significantly faster. @@ -2222,6 +2362,48 @@ auto get_runner(Args... args) -> std::shared_ptr return runner; } +/** + * @brief Computes the max_candidates and max_itopk constants passed to the search kernel. + * + * Both values are rounded up to the next power-of-two bucket supported by the kernel template + * instantiations. They are the same for single-segment and multi-segment launches, so this helper + * is shared by select_and_run and select_and_run_multi_segment. 
+ */ +struct kernel_dispatch_params { + uint32_t max_candidates; + uint32_t max_itopk; + + static kernel_dispatch_params compute(const search_params& ps, uint32_t num_itopk_candidates) + { + kernel_dispatch_params p{}; + if (num_itopk_candidates <= 64) { + p.max_candidates = 64; + } else if (num_itopk_candidates <= 128) { + p.max_candidates = 128; + } else if (num_itopk_candidates <= 256) { + p.max_candidates = 256; + } else { + p.max_candidates = 32; // irrelevant, radix-based topk is used + } + + assert(ps.itopk_size <= 512); + if (num_itopk_candidates <= 256) { // bitonic sort + if (ps.itopk_size <= 64) { + p.max_itopk = 64; + } else if (ps.itopk_size <= 128) { + p.max_itopk = 128; + } else if (ps.itopk_size <= 256) { + p.max_itopk = 256; + } else { + p.max_itopk = 512; + } + } else { // radix sort + p.max_itopk = (ps.itopk_size <= 256) ? 256 : 512; + } + return p; + } +}; + template data_handle() : nullptr; - uint32_t max_candidates{}; - if (num_itopk_candidates <= 64) { - max_candidates = 64; - } else if (num_itopk_candidates <= 128) { - max_candidates = 128; - } else if (num_itopk_candidates <= 256) { - max_candidates = 256; - } else { - max_candidates = - 32; // irrelevant, radix based topk is used (see choose_itopk_and_max_candidates) - } - - uint32_t max_itopk{}; - assert(ps.itopk_size <= 512); - if (num_itopk_candidates <= 256) { // bitonic sort - if (ps.itopk_size <= 64) { - max_itopk = 64; - } else if (ps.itopk_size <= 128) { - max_itopk = 128; - } else if (ps.itopk_size <= 256) { - max_itopk = 256; - } else { - max_itopk = 512; - } - } else { // radix sort - if (ps.itopk_size <= 256) { - max_itopk = 256; - } else { - max_itopk = 512; - } - } + auto [max_candidates, max_itopk] = kernel_dispatch_params::compute(ps, num_itopk_candidates); if (ps.persistent) { using runner_type = persistent_runner_t; @@ -2351,5 +2503,87 @@ control is returned in this thread (in persistent_runner_t constructor), so we'r RAFT_CUDA_TRY(cudaPeekAtLastError()); } } + +/** + * 
@brief Launch the multi-segment CAGRA search kernel. + * + * Searches all N segments concurrently in a single kernel launch. Each CTA (indexed by + * blockIdx.y = query_id, blockIdx.z = seg_id) independently searches one segment for one query. + * + * @param segment_descs device pointer to array of num_segments descriptors + * @param num_segments number of segments (= gridDim.z) + * @param num_queries number of queries (= gridDim.y) + * @param ps search parameters (shared across all segments) + * @param topk number of neighbors to return per segment + * @param num_itopk_candidates search_width * max_graph_degree + * @param block_size thread-block size + * @param smem_size shared memory per CTA (computed for max graph_degree) + * @param hash_bitlen global hashmap bit-length + * @param hashmap_ptr device buffer sized [num_segments * num_queries * get_size(hash_bitlen)] + * @param small_hash_bitlen small-hash bit-length (0 = disabled) + * @param small_hash_reset_interval reset interval for small hash + * @param sample_filter sample filter + * @param stream CUDA stream + */ +template +void select_and_run_multi_segment( + const multi_segment_desc_t* segment_descs, + uint32_t num_segments, + uint32_t num_queries, + const search_params& ps, + uint32_t topk, + uint32_t num_itopk_candidates, + uint32_t block_size, + uint32_t smem_size, + int64_t hash_bitlen, + IndexT* hashmap_ptr, + size_t small_hash_bitlen, + size_t small_hash_reset_interval, + SampleFilterT sample_filter, + cudaStream_t stream) +{ + using descriptor_base_type = dataset_descriptor_base_t; + + auto [max_candidates, max_itopk] = kernel_dispatch_params::compute(ps, num_itopk_candidates); + + auto kernel = + search_kernel_config_ms:: + choose_itopk_and_mx_candidates(ps.itopk_size, num_itopk_candidates, block_size); + + dim3 thread_dims(block_size, 1, 1); + dim3 block_dims(1, num_queries, num_segments); + RAFT_LOG_DEBUG("Launching ms kernel: %u threads, %u queries, %u segments, %u smem", + block_size, + 
num_queries, + num_segments, + smem_size); + auto const& kernel_launcher = [&](auto const& kernel) -> void { + kernel<<>>(segment_descs, + topk, + nullptr, // source_indices_ptr + ps.num_random_samplings, + ps.rand_xor_mask, + 0, // num_seeds + hashmap_ptr, + max_candidates, + max_itopk, + ps.itopk_size, + ps.search_width, + ps.min_iterations, + ps.max_iterations, + nullptr, // num_executed_iterations + hash_bitlen, + small_hash_bitlen, + small_hash_reset_interval, + sample_filter); + }; + cuvs::neighbors::detail::safely_launch_kernel_with_smem_size(kernel, smem_size, kernel_launcher); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + } // namespace single_cta_search } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh index ba308db98b..2af57d421f 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh @@ -10,6 +10,23 @@ namespace cuvs::neighbors::cagra::detail::single_cta_search { +/** + * @brief Per-segment descriptor for the multi-segment CAGRA search kernel. + * + * One instance per Lucene segment; the kernel reads this array from device memory using + * blockIdx.z as the segment index. 
+ */ +template +struct alignas(16) multi_segment_desc_t { + const dataset_descriptor_base_t* dataset_desc; + const DataT* queries_ptr; // [num_queries, dim] for this segment + const IndexT* graph; // [dataset_size, graph_degree] + uint32_t graph_degree; + uint32_t _pad; + uintptr_t result_indices_ptr; // tagged pointer: [num_queries, top_k] + DistanceT* result_distances_ptr; // [num_queries, top_k] +}; + template +void select_and_run_multi_segment( + const multi_segment_desc_t* segment_descs, + uint32_t num_segments, + uint32_t num_queries, + const search_params& ps, + uint32_t topk, + uint32_t num_itopk_candidates, + uint32_t block_size, + uint32_t smem_size, + int64_t hash_bitlen, + IndexT* hashmap_ptr, + size_t small_hash_bitlen, + size_t small_hash_reset_interval, + SampleFilterT sample_filter, + cudaStream_t stream); + } diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraIndex.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraIndex.java index c87f024124..2e48928636 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraIndex.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraIndex.java @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs; diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraSearchParams.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraSearchParams.java index 76e1f10bd9..a9ea94a440 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraSearchParams.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraSearchParams.java @@ -25,6 +25,9 @@ public class CagraSearchParams { private long randXORMask; private SearchAlgo searchAlgo; private HashMapMode hashMapMode; + private boolean persistent; + private float persistentLifetime; + private float persistentDeviceUsage; /** * Enum to denote algorithm used to search CAGRA Index. @@ -120,7 +123,10 @@ private CagraSearchParams( int hashmapMinBitlen, float hashmapMaxFillRate, int numRandomSamplings, - long randXORMask) { + long randXORMask, + boolean persistent, + float persistentLifetime, + float persistentDeviceUsage) { this.maxQueries = maxQueries; this.iTopKSize = iTopKSize; this.maxIterations = maxIterations; @@ -134,6 +140,9 @@ private CagraSearchParams( this.hashMapMaxFillRate = hashmapMaxFillRate; this.numRandomSamplings = numRandomSamplings; this.randXORMask = randXORMask; + this.persistent = persistent; + this.persistentLifetime = persistentLifetime; + this.persistentDeviceUsage = persistentDeviceUsage; } /** @@ -254,6 +263,33 @@ public HashMapMode getHashMapMode() { return hashMapMode; } + /** + * Gets whether the persistent kernel is enabled. + * + * @return true if the persistent kernel is enabled + */ + public boolean isPersistent() { + return persistent; + } + + /** + * Gets the persistent kernel lifetime in seconds. + * + * @return the lifetime in seconds + */ + public float getPersistentLifetime() { + return persistentLifetime; + } + + /** + * Gets the fraction of maximum grid size used by the persistent kernel. 
+ * + * @return the device usage fraction (0.0, 1.0] + */ + public float getPersistentDeviceUsage() { + return persistentDeviceUsage; + } + @Override public String toString() { return "CagraSearchParams [maxQueries=" @@ -301,8 +337,11 @@ public static class Builder { private int numRandomSamplings = 1; private float hashMapMaxFillRate = 0.5f; private long randXORMask = 0x128394; - private SearchAlgo searchAlgo; - private HashMapMode hashMapMode; + private SearchAlgo searchAlgo = SearchAlgo.AUTO; + private HashMapMode hashMapMode = HashMapMode.AUTO_HASH; + private boolean persistent = false; + private float persistentLifetime = 2.0f; + private float persistentDeviceUsage = 1.0f; /** * Default constructor. @@ -460,6 +499,43 @@ public Builder withRandXorMask(long randXORMask) { return this; } + /** + * Enables or disables the persistent kernel. + * + *

When enabled, the CAGRA SINGLE_CTA kernel stays resident on the GPU and serves search + * jobs via system-scope atomics. The kernel is shared across all indexes. + * + * @param persistent true to enable the persistent kernel + * @return an instance of this Builder + */ + public Builder withPersistent(boolean persistent) { + this.persistent = persistent; + return this; + } + + /** + * Sets the time in seconds before an idle persistent kernel exits. + * + * @param persistentLifetime lifetime in seconds (default 2.0) + * @return an instance of this Builder + */ + public Builder withPersistentLifetime(float persistentLifetime) { + this.persistentLifetime = persistentLifetime; + return this; + } + + /** + * Sets the fraction of maximum grid size used by the persistent kernel. + * Must be greater than 0.0 and not greater than 1.0. + * + * @param persistentDeviceUsage device usage fraction (default 1.0) + * @return an instance of this Builder + */ + public Builder withPersistentDeviceUsage(float persistentDeviceUsage) { + this.persistentDeviceUsage = persistentDeviceUsage; + return this; + } + /** * Builds an instance of {@link CagraSearchParams} with passed search * parameters. @@ -480,7 +556,10 @@ public CagraSearchParams build() { hashMapMinBitlen, hashMapMaxFillRate, numRandomSamplings, - randXORMask); + randXORMask, + persistent, + persistentLifetime, + persistentDeviceUsage); } } } diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSAceParams.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSAceParams.java index b70547b333..d9e6b2598d 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSAceParams.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSAceParams.java @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs; @@ -75,8 +75,13 @@ public class CuVSAceParams { */ private final double maxGpuMemoryGb; - private CuVSAceParams(long npartitions, long efConstruction, String buildDir, boolean useDisk, - double maxHostMemoryGb, double maxGpuMemoryGb) { + private CuVSAceParams( + long npartitions, + long efConstruction, + String buildDir, + boolean useDisk, + double maxHostMemoryGb, + double maxGpuMemoryGb) { this.npartitions = npartitions; this.efConstruction = efConstruction; this.buildDir = buildDir; @@ -259,8 +264,8 @@ public Builder withMaxGpuMemoryGb(double maxGpuMemoryGb) { * @return an instance of {@link CuVSAceParams} */ public CuVSAceParams build() { - return new CuVSAceParams(npartitions, efConstruction, buildDir, useDisk, - maxHostMemoryGb, maxGpuMemoryGb); + return new CuVSAceParams( + npartitions, efConstruction, buildDir, useDisk, maxHostMemoryGb, maxGpuMemoryGb); } } } diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSMatrix.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSMatrix.java index e0e39a4b4b..a19254ee3b 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSMatrix.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSMatrix.java @@ -18,6 +18,7 @@ public interface CuVSMatrix extends AutoCloseable { enum DataType { FLOAT(4), + HALF(2), INT(4), UINT(4), BYTE(1); @@ -94,6 +95,13 @@ interface Builder { */ void addVector(int[] vector); + /** + * Adds a single vector to the matrix. Each element is a raw float16 bit pattern stored in a short. 
+ * + * @param vector A short array of as many elements as the dimensions + */ + void addVector(short[] vector); + T build(); } diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSResources.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSResources.java index b105580328..01d500b948 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSResources.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSResources.java @@ -57,6 +57,24 @@ interface ScopedAccess extends AutoCloseable { */ Path tempDirectory(); + /** + * Configure the temporary workspace on this resources object as an uncapped pool backed by the + * current device memory resource. After the initial reservation is allocated on first use, + * subsequent calls to {@code cuvsRMMAlloc} / {@code cuvsRMMFree} on this handle hit the pool + * cache rather than calling {@code cudaMallocAsync} / {@code cudaFreeAsync}, reducing CUDA + * context lock contention under concurrent query threads. The pool grows without shrinking: + * freed allocations are returned to the pool rather than to the device, so the pool's + * high-water mark only increases until the resources object is closed. + * + *

The pool is per-resources-handle (i.e. per query thread when resources are thread-local), + * so there is no cross-thread pool mutex contention. Call this once after creating the resources + * object; calling it again replaces the pool. + * + * @param initialSizeBytes initial pool reservation in bytes; size {@code initialSizeBytes} to + * cover the steady-state working set to avoid growth after warmup + */ + void setWorkspacePool(long initialSizeBytes); + /** * Creates a new resources. * Equivalent to diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswAceParams.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswAceParams.java index 325f424fae..215c22838f 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswAceParams.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswAceParams.java @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs; @@ -21,8 +21,12 @@ public class HnswAceParams { private double maxHostMemoryGb; private double maxGpuMemoryGb; - private HnswAceParams(long npartitions, String buildDir, boolean useDisk, - double maxHostMemoryGb, double maxGpuMemoryGb) { + private HnswAceParams( + long npartitions, + String buildDir, + boolean useDisk, + double maxHostMemoryGb, + double maxGpuMemoryGb) { this.npartitions = npartitions; this.buildDir = buildDir; this.useDisk = useDisk; @@ -188,8 +192,7 @@ public Builder withMaxGpuMemoryGb(double maxGpuMemoryGb) { * @return an instance of {@link HnswAceParams} */ public HnswAceParams build() { - return new HnswAceParams(npartitions, buildDir, useDisk, - maxHostMemoryGb, maxGpuMemoryGb); + return new HnswAceParams(npartitions, buildDir, useDisk, maxHostMemoryGb, maxGpuMemoryGb); } } } diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndex.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndex.java 
index 3eef491b62..84979cfe0c 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndex.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndex.java @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs; diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndexParams.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndexParams.java index 070cbedae1..d68e01b58b 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndexParams.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndexParams.java @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs; @@ -283,13 +283,7 @@ public Builder withAceParams(HnswAceParams aceParams) { */ public HnswIndexParams build() { return new HnswIndexParams( - hierarchy, - efConstruction, - numThreads, - vectorDimension, - m, - metric, - aceParams); + hierarchy, efConstruction, numThreads, vectorDimension, m, metric, aceParams); } } } diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/MultiSegmentSearchResults.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/MultiSegmentSearchResults.java new file mode 100644 index 0000000000..fd43d18739 --- /dev/null +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/MultiSegmentSearchResults.java @@ -0,0 +1,55 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +package com.nvidia.cuvs; + +/** + * Holds the decoded results of a multi-segment GPU search. + * + *

Each entry {@code i} in [0, {@link #count}) identifies: + *

+ * <ul>
+ *   <li>which input segment the result came from ({@link #getSegmentIndex(int)})</li>
+ *   <li>the local vector ordinal within that segment ({@link #getOrdinal(int)})</li>
+ *   <li>the raw CAGRA distance ({@link #getDistance(int)})</li>
+ * </ul>
+ *

The caller is responsible for mapping ordinals to Lucene doc IDs using the + * segment-specific {@code ordToDoc} function and adding {@code docBase}. + * + * @since 25.10 + */ +public class MultiSegmentSearchResults { + + private final int count; + private final int[] segmentIndices; + private final int[] ordinals; + private final float[] distances; + + MultiSegmentSearchResults(int count, int[] segmentIndices, int[] ordinals, float[] distances) { + this.count = count; + this.segmentIndices = segmentIndices; + this.ordinals = ordinals; + this.distances = distances; + } + + /** Number of valid results (may be less than k if fewer candidates exist). */ + public int count() { + return count; + } + + /** Index into the original segment list for result {@code i}. */ + public int getSegmentIndex(int i) { + return segmentIndices[i]; + } + + /** Local vector ordinal within the segment for result {@code i}. */ + public int getOrdinal(int i) { + return ordinals[i]; + } + + /** Raw CAGRA distance for result {@code i} (before score normalization). 
*/ + public float getDistance(int i) { + return distances[i]; + } +} diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/SynchronizedCuVSResources.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/SynchronizedCuVSResources.java index 64a72ec32a..aa74893c6f 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/SynchronizedCuVSResources.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/SynchronizedCuVSResources.java @@ -40,6 +40,11 @@ public void close() { inner.close(); } + @Override + public void setWorkspacePool(long sizeBytes) { + inner.setWorkspacePool(sizeBytes); + } + @Override public Path tempDirectory() { return inner.tempDirectory(); diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/CuVSProvider.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/CuVSProvider.java index c39578755c..558d2e73f7 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/CuVSProvider.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/CuVSProvider.java @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs.spi; @@ -189,6 +189,16 @@ default CagraIndex mergeCagraIndexes(CagraIndex[] indexes, CagraIndexParams merg */ void enableRMMManagedPooledMemory(int initialPoolSizePercent, int maxPoolSizePercent); + /** + * Switch RMM allocations to use stream-ordered asynchronous allocation + * ({@code cudaMallocAsync} / {@code cudaFreeAsync}). Unlike the pool resource, this resource + * returns memory to the stream without blocking the CPU, eliminating device-wide synchronization + * on deallocation. This is especially beneficial when multiple CAGRA searches run concurrently + * on separate CUDA streams, because internal workspace allocations no longer serialize kernel + * launches. This operation has a global effect and will affect all resources on the current device. 
+ */ + void enableRMMAsyncMemory(); + /** Disables pooled memory on the current device, reverting back to the default setting. */ void resetRMMPooledMemory(); diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/UnsupportedProvider.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/UnsupportedProvider.java index 7cbeee4e75..6701afa47b 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/UnsupportedProvider.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/UnsupportedProvider.java @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs.spi; @@ -47,8 +47,8 @@ public HnswIndex hnswIndexFromCagra(HnswIndexParams hnswParams, CagraIndex cagra } @Override - public HnswIndex hnswIndexBuild(CuVSResources resources, HnswIndexParams hnswParams, CuVSMatrix dataset) - throws Throwable { + public HnswIndex hnswIndexBuild( + CuVSResources resources, HnswIndexParams hnswParams, CuVSMatrix dataset) throws Throwable { throw new UnsupportedOperationException(reasons); } @@ -106,6 +106,11 @@ public Level getLogLevel() { throw new UnsupportedOperationException(reasons); } + @Override + public void enableRMMAsyncMemory() { + throw new UnsupportedOperationException(reasons); + } + @Override public void enableRMMPooledMemory(int initialPoolSizePercent, int maxPoolSizePercent) { throw new UnsupportedOperationException(reasons); diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/MultiSegmentCagraSearch.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/MultiSegmentCagraSearch.java new file mode 100644 index 0000000000..ad2d5a449a --- /dev/null +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/MultiSegmentCagraSearch.java @@ -0,0 +1,220 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +package com.nvidia.cuvs; + +import static com.nvidia.cuvs.internal.common.CloseableRMMAllocation.allocateRMMSegment; +import static com.nvidia.cuvs.internal.common.Util.CudaMemcpyKind.DEVICE_TO_HOST; +import static com.nvidia.cuvs.internal.common.Util.checkCuVSError; +import static com.nvidia.cuvs.internal.common.Util.cudaMemcpyAsync; +import static com.nvidia.cuvs.internal.common.Util.getStream; +import static com.nvidia.cuvs.internal.common.Util.prepareTensor; +import static com.nvidia.cuvs.internal.panama.headers_h.cuvsCagraSearchMultiSegment; +import static com.nvidia.cuvs.internal.panama.headers_h.cuvsStreamSync; +import static com.nvidia.cuvs.internal.panama.headers_h.kDLCUDA; +import static com.nvidia.cuvs.internal.panama.headers_h.kDLFloat; +import static com.nvidia.cuvs.internal.panama.headers_h.kDLUInt; + +import com.nvidia.cuvs.internal.BufferedCagraSearch; +import com.nvidia.cuvs.internal.CuVSMatrixInternal; +import com.nvidia.cuvs.internal.CuVSParamsHelper; +import com.nvidia.cuvs.internal.SelectKHelper; +import java.lang.foreign.Arena; +import java.lang.foreign.MemorySegment; +import java.lang.foreign.ValueLayout; +import java.util.List; + +/** + * Performs a single-query approximate nearest neighbor search across multiple CAGRA index segments + * using a shared GPU buffer, eliminating per-segment device-to-host copies. + * + *

Algorithm

+ *
    + *
+ * <ol>
+ *   <li>Allocate two global device buffers sized {@code numSegments × k}:
+ *       one for uint32 neighbor ordinals and one for float32 distances.</li>
+ *   <li>Call {@code cuvsCagraSearchMultiSegment} which launches a single GPU kernel covering all
+ *       segments concurrently (one CTA per segment), writing results into the global buffers.</li>
+ *   <li>Call {@code cuvsSelectK} on the main stream to find the global top-k entirely on GPU.</li>
+ *   <li>Sync the main stream.</li>
+ *   <li>Copy the three result arrays to host in a single pass.</li>
+ *   <li>Decode each result: {@code segment = position / k},
+ *       {@code ordinal = ordinals[position]}.</li>
+ * </ol>
+ * + * @since 25.10 + */ +public class MultiSegmentCagraSearch { + + private MultiSegmentCagraSearch() {} + + /** + * Searches multiple CAGRA index segments for the global top-k nearest neighbors. + * + * @param resources shared {@link CuVSResources} handle; all queries must use the same instance + * @param indices one {@link CagraIndex} per segment, in segment order; each must implement + * {@link BufferedCagraSearch} (all built-in implementations do) + * @param queries one {@link CagraQuery} per segment (same topK for all); each query encodes + * the target vector, search parameters, and optional prefilter for that segment + * @param k number of global nearest neighbors to return + * @return decoded search results with per-result (segmentIndex, ordinal, distance) + * @throws IllegalArgumentException if {@code indices} and {@code queries} differ in size, or if + * any index does not support buffered search + */ + public static MultiSegmentSearchResults search( + CuVSResources resources, List indices, List queries, int k) + throws Throwable { + int numSegments = indices.size(); + if (numSegments != queries.size()) { + throw new IllegalArgumentException( + "indices and queries must have the same size; got " + + numSegments + + " vs " + + queries.size()); + } + if (numSegments == 0) { + return new MultiSegmentSearchResults(0, new int[0], new int[0], new float[0]); + } + + // Validate that all indices support buffered search. 
+ BufferedCagraSearch[] buffered = new BufferedCagraSearch[numSegments]; + for (int i = 0; i < numSegments; i++) { + CagraIndex idx = indices.get(i); + if (!(idx instanceof BufferedCagraSearch)) { + throw new IllegalArgumentException( + "Index at position " + i + " does not support buffered search"); + } + buffered[i] = (BufferedCagraSearch) idx; + } + + long totalCandidates = (long) numSegments * k; + long neighborsBytes = totalCandidates * Integer.BYTES; // uint32 per ordinal + long distancesBytes = totalCandidates * Float.BYTES; // float32 per distance + long outIdxBytes = (long) k * Long.BYTES; // int64 positions from select_k + long outValBytes = (long) k * Float.BYTES; + + CagraSearchParams searchParameters = queries.get(0).getCagraSearchParameters(); + + try (var resourcesAccessor = resources.access()) { + long cuvsRes = resourcesAccessor.handle(); + var cuvsStream = getStream(cuvsRes); + + try (var globalNeighborsDP = allocateRMMSegment(cuvsRes, neighborsBytes); + var globalDistancesDP = allocateRMMSegment(cuvsRes, distancesBytes); + var outIdxDP = allocateRMMSegment(cuvsRes, outIdxBytes); + var outValDP = allocateRMMSegment(cuvsRes, outValBytes)) { + + // --- Phase 1: call cuvsCagraSearchMultiSegment --- + // Single kernel launch covers all segments; results land in globalNeighborsDP / + // globalDistancesDP on the same CUDA stream, so SelectK below sees them via ordering. 
+ try (var arena = Arena.ofConfined()) { + MemorySegment sp = CuVSParamsHelper.buildCagraSearchParams(arena, searchParameters); + + // Build C arrays: cuvsCagraIndex_t* indices, DLManagedTensor** q/n/d + MemorySegment indexArray = arena.allocate(ValueLayout.ADDRESS, numSegments); + MemorySegment queriesArray = arena.allocate(ValueLayout.ADDRESS, numSegments); + MemorySegment neighborsArray = arena.allocate(ValueLayout.ADDRESS, numSegments); + MemorySegment distancesArray = arena.allocate(ValueLayout.ADDRESS, numSegments); + + long[] segShape = {1, k}; + for (int i = 0; i < numSegments; i++) { + // Index handle + indexArray.setAtIndex(ValueLayout.ADDRESS, i, buffered[i].getIndexHandle()); + + // Query DLTensor + var queryVectors = (CuVSMatrixInternal) queries.get(i).getQueryVectors(); + queriesArray.setAtIndex(ValueLayout.ADDRESS, i, queryVectors.toTensor(arena)); + + // Neighbors DLTensor — slice of global buffer + long nByteOffset = (long) i * k * Integer.BYTES; + MemorySegment nSlice = + MemorySegment.ofAddress(globalNeighborsDP.handle().address() + nByteOffset); + neighborsArray.setAtIndex( + ValueLayout.ADDRESS, + i, + prepareTensor(arena, nSlice, segShape, kDLUInt(), 32, kDLCUDA())); + + // Distances DLTensor — slice of global buffer + long dByteOffset = (long) i * k * Float.BYTES; + MemorySegment dSlice = + MemorySegment.ofAddress(globalDistancesDP.handle().address() + dByteOffset); + distancesArray.setAtIndex( + ValueLayout.ADDRESS, + i, + prepareTensor(arena, dSlice, segShape, kDLFloat(), 32, kDLCUDA())); + } + + checkCuVSError( + cuvsCagraSearchMultiSegment( + cuvsRes, + sp, + numSegments, + indexArray, + queriesArray, + neighborsArray, + distancesArray), + "cuvsCagraSearchMultiSegment"); + } + + // --- Phase 2: select global top-k on GPU --- + SelectKHelper.selectK( + cuvsRes, + globalDistancesDP.handle(), + totalCandidates, + outValDP.handle(), + outIdxDP.handle(), + k); + + // No stream sync needed here: the D2H copies below are enqueued on the same 
cuvsStream, + // so CUDA stream ordering guarantees selectK completes before the copies begin. + + // --- Phase 3: single device-to-host copy for all three arrays --- + // Layout (in order of decreasing alignment): int64 outIdx | float32 outVal | uint32 + // ordinals + try (var hostArena = Arena.ofConfined()) { + MemorySegment hostBuf = + hostArena.allocate(outIdxBytes + outValBytes + neighborsBytes, Long.BYTES); + MemorySegment hostOutIdx = hostBuf.asSlice(0, outIdxBytes); + MemorySegment hostOutVal = hostBuf.asSlice(outIdxBytes, outValBytes); + MemorySegment hostAllOrdinals = + hostBuf.asSlice(outIdxBytes + outValBytes, neighborsBytes); + + cudaMemcpyAsync(hostOutIdx, outIdxDP.handle(), outIdxBytes, DEVICE_TO_HOST, cuvsStream); + cudaMemcpyAsync(hostOutVal, outValDP.handle(), outValBytes, DEVICE_TO_HOST, cuvsStream); + cudaMemcpyAsync( + hostAllOrdinals, + globalNeighborsDP.handle(), + neighborsBytes, + DEVICE_TO_HOST, + cuvsStream); + + checkCuVSError(cuvsStreamSync(cuvsRes), "cuvsStreamSync after D2H copy"); + + // --- Phase 4: decode results --- + int[] segmentIndices = new int[k]; + int[] selectedOrdinals = new int[k]; + float[] selectedDistances = new float[k]; + int count = 0; + + for (int j = 0; j < k; j++) { + long pos = hostOutIdx.getAtIndex(ValueLayout.JAVA_LONG, j); + float dist = hostOutVal.getAtIndex(ValueLayout.JAVA_FLOAT, j); + int ordinal = hostAllOrdinals.getAtIndex(ValueLayout.JAVA_INT, (int) pos); + + if (ordinal < 0) { + // CAGRA uses negative sentinel values for unfilled slots + continue; + } + segmentIndices[count] = (int) (pos / k); + selectedOrdinals[count] = ordinal; + selectedDistances[count] = dist; + count++; + } + + return new MultiSegmentSearchResults( + count, segmentIndices, selectedOrdinals, selectedDistances); + } + } + } + } +} diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/BufferedCagraSearch.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/BufferedCagraSearch.java new file mode 100644 index 
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
 * SPDX-License-Identifier: Apache-2.0
 */
package com.nvidia.cuvs.internal;

import com.nvidia.cuvs.CagraQuery;
import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;

/**
 * Internal interface implemented by CAGRA index classes that support writing
 * search results directly into a caller-owned device buffer without syncing
 * the stream or copying results to host.
 *
 * <p>Used by {@link com.nvidia.cuvs.MultiSegmentCagraSearch} to queue all
 * per-segment searches before running a single GPU-side top-k reduction.
 */
public interface BufferedCagraSearch {

  /**
   * Runs CAGRA search and writes results into a slice of caller-owned device
   * buffers without copying results to host or syncing the stream.
   *
   * <p>Results are written at element offset {@code segmentIdx * query.getTopK()}
   * in each buffer:
   *
   * <ul>
   *   <li>{@code globalNeighborsDP}: uint32 ordinals</li>
   *   <li>{@code globalDistancesDP}: float32 distances</li>
   * </ul>
   *
   * <p>The search is submitted to {@code segmentStream}. The caller is responsible for
   * synchronizing that stream (e.g. via a CUDA event) before consuming the output buffers.
   *
   * @param query query with vectors, topK, search params, optional prefilter
   * @param globalNeighborsDP device pointer to the shared uint32 neighbors buffer
   * @param globalDistancesDP device pointer to the shared float32 distances buffer
   * @param segmentIdx zero-based segment index; determines the write offset
   * @param segmentCuvsRes {@code cuvsResources_t} handle whose CUDA stream receives the kernel
   * @param segmentStream CUDA stream corresponding to {@code segmentCuvsRes}; passed explicitly
   *     to avoid a redundant {@code cuvsStreamGet} call inside the method
   * @param searchParams pre-built {@code cuvsCagraSearchParams} struct; shared across all
   *     segments to avoid repeated allocation and population
   * @param arena shared scratch arena for per-call CPU-side allocations (tensor descriptors,
   *     filter struct); must remain open until after this call returns and the GPU kernel
   *     has launched
   */
  void searchIntoBuffer(
      CagraQuery query,
      MemorySegment globalNeighborsDP,
      MemorySegment globalDistancesDP,
      int segmentIdx,
      long segmentCuvsRes,
      MemorySegment segmentStream,
      MemorySegment searchParams,
      Arena arena)
      throws Throwable;

  /**
   * Returns the raw {@code cuvsCagraIndex_t} handle as a {@link MemorySegment}.
   * Used by {@link com.nvidia.cuvs.MultiSegmentCagraSearch} to build the index pointer array
   * for {@code cuvsCagraSearchMultiSegment}.
   */
  MemorySegment getIndexHandle();
}

Unlike {@link #search}, this method does not copy results to host memory and does + * not synchronize the CUDA stream. The caller is responsible for both after all + * per-segment searches have been queued. + * + *

Results for this segment are written at byte offset {@code segmentIdx * topK} elements + * from the start of each buffer: + *

    + *
  • {@code globalNeighborsDP}: uint32 ordinals, {@code topK} entries per segment
  • + *
  • {@code globalDistancesDP}: float32 distances, {@code topK} entries per segment
  • + *
+ * + * @param query query parameters including vectors, topK, search params, and optional + * prefilter; must have exactly one query vector (numQueries == 1) + * @param globalNeighborsDP device pointer to the start of the shared uint32 neighbors buffer + * @param globalDistancesDP device pointer to the start of the shared float32 distances buffer + * @param segmentIdx zero-based index of this segment; determines the write offset + * @param segmentCuvsRes {@code cuvsResources_t} handle whose CUDA stream receives the kernel + */ + @Override + public MemorySegment getIndexHandle() { + return cagraIndexReference.getMemorySegment(); + } + + @Override + public void searchIntoBuffer( + CagraQuery query, + MemorySegment globalNeighborsDP, + MemorySegment globalDistancesDP, + int segmentIdx, + long segmentCuvsRes, + MemorySegment segmentStream, + MemorySegment searchParams, + Arena arena) + throws Throwable { + checkNotDestroyed(); + int topK = query.getTopK(); + var queryVectors = (CuVSMatrixInternal) query.getQueryVectors(); + long numQueries = queryVectors.size(); + + final boolean hasPreFilter = query.getPrefilter() != null; + final BitSet[] prefilters = + hasPreFilter ? new BitSet[] {query.getPrefilter()} : EMPTY_PREFILTER_BITSET; + final long prefilterDataLength = hasPreFilter ? query.getNumDocs() * prefilters.length : 0; + final long prefilterLen = hasPreFilter ? (prefilterDataLength + 31) / 32 : 0; + final long prefilterBytes = C_INT_BYTE_SIZE * prefilterLen; + + // Pointers into the global buffer at this segment's slice. 
+ long neighborByteOffset = (long) segmentIdx * topK * C_INT_BYTE_SIZE; + long distanceByteOffset = (long) segmentIdx * topK * Float.BYTES; + MemorySegment neighborSlice = + MemorySegment.ofAddress(globalNeighborsDP.address() + neighborByteOffset); + MemorySegment distanceSlice = + MemorySegment.ofAddress(globalDistancesDP.address() + distanceByteOffset); + + if (!(queryVectors instanceof CuVSDeviceMatrix)) { + throw new IllegalArgumentException( + "searchIntoBuffer requires query vectors already on device"); + } + try (var prefilterDP = + hasPreFilter + ? allocateRMMSegment(segmentCuvsRes, prefilterBytes) + : CloseableRMMAllocation.EMPTY) { + var deviceQueryVectors = (CuVSMatrixInternal) queryVectors; + + var queryTensor = deviceQueryVectors.toTensor(arena); + long[] neighborsShape = {numQueries, topK}; + MemorySegment neighborsTensor = + prepareTensor(arena, neighborSlice, neighborsShape, kDLUInt(), 32, kDLCUDA()); + long[] distancesShape = {numQueries, topK}; + MemorySegment distancesTensor = + prepareTensor( + arena, + distanceSlice, + distancesShape, + deviceQueryVectors.code(), + deviceQueryVectors.bits(), + kDLCUDA()); + + MemorySegment prefilter = cuvsFilter.allocate(arena); + if (!hasPreFilter) { + cuvsFilter.type(prefilter, 0); // NO_FILTER + cuvsFilter.addr(prefilter, 0); + } else { + BitSet concatenatedFilters = concatenate(prefilters, query.getNumDocs()); + long[] filters = concatenatedFilters.toLongArray(); + var prefilterDataMemorySegment = buildMemorySegment(arena, filters); + long[] prefilterShape = {prefilterLen}; + Util.cudaMemcpyAsync( + prefilterDP.handle(), + prefilterDataMemorySegment, + prefilterBytes, + HOST_TO_DEVICE, + segmentStream); + MemorySegment prefilterTensor = + prepareTensor(arena, prefilterDP.handle(), prefilterShape, kDLUInt(), 32, kDLCUDA()); + cuvsFilter.type(prefilter, 1); + cuvsFilter.addr(prefilter, prefilterTensor.address()); + } + + var returnValue = + cuvsCagraSearch( + segmentCuvsRes, + searchParams, + 
cagraIndexReference.getMemorySegment(), + queryTensor, + neighborsTensor, + distancesTensor, + prefilter); + checkCuVSError(returnValue, "cuvsCagraSearch (searchIntoBuffer)"); + // Intentionally no cudaMemcpyAsync and no stream sync here. + // The caller syncs the stream after queuing all segment searches. + } + } + @Override public void serialize(OutputStream outputStream) throws Throwable { Path path = @@ -632,8 +749,10 @@ private static void populateNativeIndexParams( cuvsAceParams.npartitions(cuvsAceParamsMemorySegment, cuVSAceParams.getNpartitions()); cuvsAceParams.ef_construction(cuvsAceParamsMemorySegment, cuVSAceParams.getEfConstruction()); cuvsAceParams.use_disk(cuvsAceParamsMemorySegment, cuVSAceParams.isUseDisk()); - cuvsAceParams.max_host_memory_gb(cuvsAceParamsMemorySegment, cuVSAceParams.getMaxHostMemoryGb()); - cuvsAceParams.max_gpu_memory_gb(cuvsAceParamsMemorySegment, cuVSAceParams.getMaxGpuMemoryGb()); + cuvsAceParams.max_host_memory_gb( + cuvsAceParamsMemorySegment, cuVSAceParams.getMaxHostMemoryGb()); + cuvsAceParams.max_gpu_memory_gb( + cuvsAceParamsMemorySegment, cuVSAceParams.getMaxGpuMemoryGb()); String buildDir = cuVSAceParams.getBuildDir(); if (buildDir != null && !buildDir.isEmpty()) { diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSMatrixBaseImpl.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSMatrixBaseImpl.java index 98f4095ffc..03e08ac4c4 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSMatrixBaseImpl.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSMatrixBaseImpl.java @@ -7,6 +7,7 @@ import static com.nvidia.cuvs.internal.common.LinkerHelper.C_CHAR; import static com.nvidia.cuvs.internal.common.LinkerHelper.C_FLOAT; import static com.nvidia.cuvs.internal.common.LinkerHelper.C_INT; +import static com.nvidia.cuvs.internal.common.LinkerHelper.C_SHORT; import static com.nvidia.cuvs.internal.common.Util.checkCuVSError; import static 
com.nvidia.cuvs.internal.panama.headers_h.*; @@ -95,6 +96,7 @@ public ValueLayout valueLayout() { protected static ValueLayout valueLayoutFromType(DataType dataType) { return switch (dataType) { case FLOAT -> C_FLOAT; + case HALF -> C_SHORT; case INT, UINT -> C_INT; case BYTE -> C_CHAR; }; @@ -177,6 +179,8 @@ private static DataType dataTypeFromTensor(byte code, byte bits) { dataType = DataType.INT; } else if (code == kDLFloat() && bits == 32) { dataType = DataType.FLOAT; + } else if (code == kDLFloat() && bits == 16) { + dataType = DataType.HALF; } else if ((code == kDLInt() || code == kDLUInt()) && bits == 8) { dataType = DataType.BYTE; } else { diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSMatrixInternal.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSMatrixInternal.java index 35715b8336..6b4b617493 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSMatrixInternal.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSMatrixInternal.java @@ -38,7 +38,7 @@ default int code() { static int code(DataType dataType) { return switch (dataType) { - case FLOAT -> kDLFloat(); + case FLOAT, HALF -> kDLFloat(); case INT -> kDLInt(); case UINT, BYTE -> kDLUInt(); }; diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java index 950504bc5a..a4c914dbf8 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs.internal; @@ -7,7 +7,9 @@ import static com.nvidia.cuvs.internal.common.Util.checkCuVSError; import static com.nvidia.cuvs.internal.panama.headers_h.*; +import com.nvidia.cuvs.CagraSearchParams; import com.nvidia.cuvs.internal.common.CloseableHandle; +import com.nvidia.cuvs.internal.panama.*; import java.lang.foreign.Arena; import java.lang.foreign.MemorySegment; @@ -25,6 +27,30 @@ public final class CuVSParamsHelper { private CuVSParamsHelper() {} + /** + * Allocates and populates a {@code cuvsCagraSearchParams} struct into {@code arena}. + * The returned segment is valid for the lifetime of {@code arena}. + */ + public static MemorySegment buildCagraSearchParams(Arena arena, CagraSearchParams params) { + MemorySegment seg = cuvsCagraSearchParams.allocate(arena); + cuvsCagraSearchParams.max_queries(seg, params.getMaxQueries()); + cuvsCagraSearchParams.itopk_size(seg, params.getITopKSize()); + cuvsCagraSearchParams.max_iterations(seg, params.getMaxIterations()); + cuvsCagraSearchParams.algo(seg, params.getCagraSearchAlgo().value); + cuvsCagraSearchParams.team_size(seg, params.getTeamSize()); + cuvsCagraSearchParams.search_width(seg, params.getSearchWidth()); + cuvsCagraSearchParams.min_iterations(seg, params.getMinIterations()); + cuvsCagraSearchParams.thread_block_size(seg, params.getThreadBlockSize()); + cuvsCagraSearchParams.hashmap_mode(seg, params.getHashMapMode().value); + cuvsCagraSearchParams.hashmap_max_fill_rate(seg, params.getHashMapMaxFillRate()); + cuvsCagraSearchParams.num_random_samplings(seg, params.getNumRandomSamplings()); + cuvsCagraSearchParams.rand_xor_mask(seg, params.getRandXORMask()); + cuvsCagraSearchParams.persistent(seg, params.isPersistent()); + cuvsCagraSearchParams.persistent_lifetime(seg, params.getPersistentLifetime()); + cuvsCagraSearchParams.persistent_device_usage(seg, params.getPersistentDeviceUsage()); + return seg; + } + public static CloseableHandle 
  /**
   * Configures this resources handle's temporary workspace allocator as a growing pool.
   *
   * <p>Delegates to the native {@code cuvsResourcesSetWorkspacePool}; per the C header, the
   * pool is uncapped and does not shrink until the resources object is destroyed.
   *
   * @param sizeBytes initial pool reservation in bytes
   */
  @Override
  public void setWorkspacePool(long sizeBytes) {
    checkCuVSError(
        cuvsResourcesSetWorkspacePool(resourceHandle, sizeBytes), "cuvsResourcesSetWorkspacePool");
  }
* SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs.internal; @@ -251,8 +251,8 @@ public static HnswIndex.Builder newBuilder(CuVSResources cuvsResources) { * @return A new HNSW index ready for search * @throws Throwable if an error occurs during building */ - public static HnswIndex build(CuVSResources resources, HnswIndexParams hnswParams, CuVSMatrix dataset) - throws Throwable { + public static HnswIndex build( + CuVSResources resources, HnswIndexParams hnswParams, CuVSMatrix dataset) throws Throwable { Objects.requireNonNull(resources); Objects.requireNonNull(hnswParams); Objects.requireNonNull(dataset); @@ -288,7 +288,8 @@ public static HnswIndex build(CuVSResources resources, HnswIndexParams hnswParam return new HnswIndexImpl(new IndexReference(hnswIndex), resources, hnswParams); } - private static CloseableHandle createHnswIndexParamsForBuild(Arena arena, HnswIndexParams params) { + private static CloseableHandle createHnswIndexParamsForBuild( + Arena arena, HnswIndexParams params) { var hnswParams = createHnswIndexParams(); MemorySegment seg = hnswParams.handle(); @@ -324,7 +325,7 @@ private static MemorySegment prepareTensorFromMatrix(Arena arena, CuVSMatrix dat return prepareTensor( arena, matrixInternal.memorySegment(), - new long[]{dataset.size(), dataset.columns()}, + new long[] {dataset.size(), dataset.columns()}, matrixInternal.code(), matrixInternal.bits(), kDLCPU()); diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/SelectKHelper.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/SelectKHelper.java new file mode 100644 index 0000000000..f859ea26c7 --- /dev/null +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/SelectKHelper.java @@ -0,0 +1,94 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +package com.nvidia.cuvs.internal; + +import static com.nvidia.cuvs.internal.common.LinkerHelper.C_INT; +import static com.nvidia.cuvs.internal.common.LinkerHelper.C_LONG; +import static com.nvidia.cuvs.internal.common.LinkerHelper.C_POINTER; +import static com.nvidia.cuvs.internal.common.Util.checkCuVSError; +import static com.nvidia.cuvs.internal.common.Util.prepareTensor; +import static com.nvidia.cuvs.internal.panama.headers_h.kDLCUDA; +import static com.nvidia.cuvs.internal.panama.headers_h.kDLFloat; +import static com.nvidia.cuvs.internal.panama.headers_h.kDLInt; + +import java.lang.foreign.Arena; +import java.lang.foreign.FunctionDescriptor; +import java.lang.foreign.Linker; +import java.lang.foreign.MemorySegment; +import java.lang.foreign.SymbolLookup; +import java.lang.invoke.MethodHandle; + +/** + * Panama FFI binding for {@code cuvsSelectK}. + * + *

Selects the k smallest float values from a flat device array of n candidates, writing output + * distances and their flat-array positions (int64) into caller-supplied device buffers. + */ +public class SelectKHelper { + + private static final MethodHandle cuvsSelectK$mh; + + static { + var linker = Linker.nativeLinker(); + SymbolLookup lookup = + SymbolLookup.libraryLookup(System.mapLibraryName("cuvs_c"), Arena.ofAuto()) + .or(SymbolLookup.loaderLookup()) + .or(linker.defaultLookup()); + + cuvsSelectK$mh = + linker.downcallHandle( + lookup + .find("cuvsSelectK") + .orElseThrow(() -> new UnsatisfiedLinkError("cuvsSelectK not found in libcuvs_c")), + FunctionDescriptor.of( + C_INT, // return: cuvsError_t + C_LONG, // cuvsResources_t res + C_POINTER, // DLManagedTensor* in_val + C_POINTER, // DLManagedTensor* out_val + C_POINTER // DLManagedTensor* out_idx + )); + } + + private SelectKHelper() {} + + /** + * Selects the {@code k} smallest distances from a flat device array of {@code n} candidates. + * + *

Output positions in {@code outIdxDP} are int64 column indices into [0, n). The caller + * recovers per-segment identity as {@code segment = position / segmentK}. + * + * @param cuvsRes cuvsResources_t handle (raw long) + * @param inValDP device pointer to float[n] input distances + * @param n number of input candidates + * @param outValDP device pointer to float[k] output distances + * @param outIdxDP device pointer to int64[k] output positions + * @param k number of results to select + */ + public static void selectK( + long cuvsRes, + MemorySegment inValDP, + long n, + MemorySegment outValDP, + MemorySegment outIdxDP, + long k) { + try (var arena = Arena.ofConfined()) { + long[] inShape = {1, n}; + long[] outShape = {1, k}; + + MemorySegment inValTensor = prepareTensor(arena, inValDP, inShape, kDLFloat(), 32, kDLCUDA()); + MemorySegment outValTensor = + prepareTensor(arena, outValDP, outShape, kDLFloat(), 32, kDLCUDA()); + MemorySegment outIdxTensor = + prepareTensor(arena, outIdxDP, outShape, kDLInt(), 64, kDLCUDA()); + + int rc = (int) cuvsSelectK$mh.invokeExact(cuvsRes, inValTensor, outValTensor, outIdxTensor); + checkCuVSError(rc, "cuvsSelectK"); + } catch (RuntimeException | Error e) { + throw e; + } catch (Throwable t) { + throw new RuntimeException("cuvsSelectK failed", t); + } + } +} diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/common/LinkerHelper.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/common/LinkerHelper.java index 6de70ce920..79b440b4a8 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/common/LinkerHelper.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/common/LinkerHelper.java @@ -25,6 +25,9 @@ public class LinkerHelper { public static final ValueLayout.OfLong C_LONG = (ValueLayout.OfLong) LINKER.canonicalLayouts().get("long"); + public static final ValueLayout.OfShort C_SHORT = + (ValueLayout.OfShort) LINKER.canonicalLayouts().get("short"); + public static final 
ValueLayout.OfFloat C_FLOAT = (ValueLayout.OfFloat) LINKER.canonicalLayouts().get("float"); diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java index 1d3199f26f..72f0b4e232 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs.spi; @@ -138,6 +138,9 @@ public String toString() { private final cuvsRMMMemoryResourceReset cuvsRMMMemoryResourceResetInvoker = cuvsRMMMemoryResourceReset.makeInvoker(); + private final cuvsRMMAsyncMemoryResourceEnable cuvsRMMAsyncMemoryResourceEnableInvoker = + cuvsRMMAsyncMemoryResourceEnable.makeInvoker(); + private final cuvsGetLogLevel GET_LOG_LEVEL_INVOKER = cuvsGetLogLevel.makeInvoker(); private JDKProvider() {} @@ -255,8 +258,8 @@ public HnswIndex hnswIndexFromCagra(HnswIndexParams hnswParams, CagraIndex cagra } @Override - public HnswIndex hnswIndexBuild(CuVSResources resources, HnswIndexParams hnswParams, CuVSMatrix dataset) - throws Throwable { + public HnswIndex hnswIndexBuild( + CuVSResources resources, HnswIndexParams hnswParams, CuVSMatrix dataset) throws Throwable { return HnswIndexImpl.build(resources, hnswParams, dataset); } @@ -436,6 +439,12 @@ public Level getLogLevel() { throw new IllegalArgumentException("Unexpected log level [" + logLevel + "]"); } + @Override + public void enableRMMAsyncMemory() { + checkCuVSError( + cuvsRMMAsyncMemoryResourceEnableInvoker.apply(), "cuvsRMMAsyncMemoryResourceEnable"); + } + @Override public void enableRMMPooledMemory(int initialPoolSizePercent, int maxPoolSizePercent) { checkCuVSError( @@ -603,6 +612,15 @@ public void addVector(int[] vector) { 
  /**
   * Appends one vector of 16-bit elements to this builder.
   *
   * <p>Each {@code short} carries the raw bits of one element — presumably IEEE 754
   * half-precision, matching the HALF data type handled elsewhere in this package;
   * confirm against {@code internalAddVector} implementations.
   *
   * @param vector one value per column; length must equal the builder's column count
   * @throws IllegalArgumentException if {@code vector.length != columns}
   */
  public void addVector(short[] vector) {
    if (vector.length != columns) {
      throw new IllegalArgumentException(
          String.format(
              Locale.ROOT, "Expected a vector of size [%d], got [%d]", columns, vector.length));
    }
    // Wrap the heap array as a MemorySegment; the abstract sink decides where it is copied.
    internalAddVector(MemorySegment.ofArray(vector));
  }