From c6e880d254bd07fac18050e6290c8d77b3cdc338 Mon Sep 17 00:00:00 2001 From: James Xia Date: Tue, 31 Mar 2026 16:12:03 -0700 Subject: [PATCH 01/11] Add multi-segment GPU search: cuvsSelectK C API and Java bindings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduces a globally-optimized path for searching across multiple CAGRA index segments without per-segment device-to-host copies. - c/include/cuvs/selection/select_k.h, c/src/selection/select_k.cpp: new C API function cuvsSelectK that wraps cuvs::selection::select_k for selecting the k smallest float values from a flat device array - BufferedCagraSearch (internal interface): searchIntoBuffer() writes per-segment CAGRA results into a slice of a caller-owned device buffer without syncing the stream or copying to host - CagraIndexImpl: implements BufferedCagraSearch; searchIntoBuffer() computes byte offsets into the global buffer using segmentIdx * topK - SelectKHelper: Panama FFI binding for cuvsSelectK - MultiSegmentCagraSearch: orchestrates the full pipeline — queue all per-segment searches, sync once, run cuvsSelectK on GPU, sync again, single D2H copy, decode results - MultiSegmentSearchResults: simple result carrier with count, segmentIndices, ordinals, and distances arrays --- c/CMakeLists.txt | 1 + c/include/cuvs/selection/select_k.h | 37 ++++ c/src/selection/select_k.cpp | 42 +++++ .../main/java/com/nvidia/cuvs/CagraIndex.java | 2 +- .../java/com/nvidia/cuvs/CuVSAceParams.java | 15 +- .../java/com/nvidia/cuvs/HnswAceParams.java | 13 +- .../main/java/com/nvidia/cuvs/HnswIndex.java | 2 +- .../java/com/nvidia/cuvs/HnswIndexParams.java | 10 +- .../cuvs/MultiSegmentSearchResults.java | 55 ++++++ .../com/nvidia/cuvs/spi/CuVSProvider.java | 2 +- .../nvidia/cuvs/spi/UnsupportedProvider.java | 6 +- .../nvidia/cuvs/MultiSegmentCagraSearch.java | 164 ++++++++++++++++++ .../cuvs/internal/BufferedCagraSearch.java | 44 +++++ .../nvidia/cuvs/internal/CagraIndexImpl.java | 
122 ++++++++++++- .../cuvs/internal/CuVSParamsHelper.java | 2 +- .../nvidia/cuvs/internal/HnswIndexImpl.java | 11 +- .../nvidia/cuvs/internal/SelectKHelper.java | 94 ++++++++++ .../com/nvidia/cuvs/spi/JDKProvider.java | 6 +- 18 files changed, 591 insertions(+), 37 deletions(-) create mode 100644 c/include/cuvs/selection/select_k.h create mode 100644 c/src/selection/select_k.cpp create mode 100644 java/cuvs-java/src/main/java/com/nvidia/cuvs/MultiSegmentSearchResults.java create mode 100644 java/cuvs-java/src/main/java22/com/nvidia/cuvs/MultiSegmentCagraSearch.java create mode 100644 java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/BufferedCagraSearch.java create mode 100644 java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/SelectKHelper.java diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt index 2439f9269f..df0c81923f 100644 --- a/c/CMakeLists.txt +++ b/c/CMakeLists.txt @@ -104,6 +104,7 @@ add_library( src/preprocessing/quantize/pq.cpp src/preprocessing/quantize/scalar.cpp src/distance/pairwise_distance.cpp + src/selection/select_k.cpp ) add_library(cuvs::c_api ALIAS cuvs_c) set_target_properties( diff --git a/c/include/cuvs/selection/select_k.h b/c/include/cuvs/selection/select_k.h new file mode 100644 index 0000000000..ad79b9e3a7 --- /dev/null +++ b/c/include/cuvs/selection/select_k.h @@ -0,0 +1,37 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Select the k smallest values from a flat device array of n candidates. + * + * Treats `in_val` as a matrix of shape [1, n] and selects the `k` smallest + * float values. 
`out_idx` receives the int64 column positions of the selected + * values in [0, n), so the caller can recover per-segment identity as: + * + * segment_index = out_idx[j] / segment_k + * position_in_segment = out_idx[j] % segment_k + * + * @param[in] res cuvsResources_t handle + * @param[in] in_val DLManagedTensor* shape [1, n], float32, device memory + * @param[out] out_val DLManagedTensor* shape [1, k], float32, device memory + * @param[out] out_idx DLManagedTensor* shape [1, k], int64, device memory + * @return cuvsError_t + */ +cuvsError_t cuvsSelectK(cuvsResources_t res, + DLManagedTensor* in_val, + DLManagedTensor* out_val, + DLManagedTensor* out_idx); + +#ifdef __cplusplus +} +#endif diff --git a/c/src/selection/select_k.cpp b/c/src/selection/select_k.cpp new file mode 100644 index 0000000000..f68416454a --- /dev/null +++ b/c/src/selection/select_k.cpp @@ -0,0 +1,42 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include "../core/exceptions.hpp" +#include +#include + +#include +#include + +extern "C" cuvsError_t cuvsSelectK(cuvsResources_t res, + DLManagedTensor* in_val, + DLManagedTensor* out_val, + DLManagedTensor* out_idx) +{ + return cuvs::core::translate_exceptions([=] { + auto* res_ptr = reinterpret_cast(res); + + int64_t n = in_val->dl_tensor.shape[1]; + int64_t k = out_val->dl_tensor.shape[1]; + + auto in_view = raft::make_device_matrix_view( + static_cast(in_val->dl_tensor.data), 1, n); + + auto out_val_view = raft::make_device_matrix_view( + static_cast(out_val->dl_tensor.data), 1, k); + + auto out_idx_view = raft::make_device_matrix_view( + static_cast(out_idx->dl_tensor.data), 1, k); + + cuvs::selection::select_k( + *res_ptr, + in_view, + std::nullopt, // implicit positions [0, n) as in_idx + out_val_view, + out_idx_view, + true); // select_min = true (smallest distance = nearest neighbor) + }); +} diff --git 
a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraIndex.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraIndex.java index c87f024124..2e48928636 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraIndex.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraIndex.java @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs; diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSAceParams.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSAceParams.java index b70547b333..d9e6b2598d 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSAceParams.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSAceParams.java @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs; @@ -75,8 +75,13 @@ public class CuVSAceParams { */ private final double maxGpuMemoryGb; - private CuVSAceParams(long npartitions, long efConstruction, String buildDir, boolean useDisk, - double maxHostMemoryGb, double maxGpuMemoryGb) { + private CuVSAceParams( + long npartitions, + long efConstruction, + String buildDir, + boolean useDisk, + double maxHostMemoryGb, + double maxGpuMemoryGb) { this.npartitions = npartitions; this.efConstruction = efConstruction; this.buildDir = buildDir; @@ -259,8 +264,8 @@ public Builder withMaxGpuMemoryGb(double maxGpuMemoryGb) { * @return an instance of {@link CuVSAceParams} */ public CuVSAceParams build() { - return new CuVSAceParams(npartitions, efConstruction, buildDir, useDisk, - maxHostMemoryGb, maxGpuMemoryGb); + return new CuVSAceParams( + npartitions, efConstruction, buildDir, useDisk, maxHostMemoryGb, maxGpuMemoryGb); } } } diff --git 
a/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswAceParams.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswAceParams.java index 325f424fae..215c22838f 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswAceParams.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswAceParams.java @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs; @@ -21,8 +21,12 @@ public class HnswAceParams { private double maxHostMemoryGb; private double maxGpuMemoryGb; - private HnswAceParams(long npartitions, String buildDir, boolean useDisk, - double maxHostMemoryGb, double maxGpuMemoryGb) { + private HnswAceParams( + long npartitions, + String buildDir, + boolean useDisk, + double maxHostMemoryGb, + double maxGpuMemoryGb) { this.npartitions = npartitions; this.buildDir = buildDir; this.useDisk = useDisk; @@ -188,8 +192,7 @@ public Builder withMaxGpuMemoryGb(double maxGpuMemoryGb) { * @return an instance of {@link HnswAceParams} */ public HnswAceParams build() { - return new HnswAceParams(npartitions, buildDir, useDisk, - maxHostMemoryGb, maxGpuMemoryGb); + return new HnswAceParams(npartitions, buildDir, useDisk, maxHostMemoryGb, maxGpuMemoryGb); } } } diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndex.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndex.java index 3eef491b62..84979cfe0c 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndex.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndex.java @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs; diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndexParams.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndexParams.java index 070cbedae1..d68e01b58b 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndexParams.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndexParams.java @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs; @@ -283,13 +283,7 @@ public Builder withAceParams(HnswAceParams aceParams) { */ public HnswIndexParams build() { return new HnswIndexParams( - hierarchy, - efConstruction, - numThreads, - vectorDimension, - m, - metric, - aceParams); + hierarchy, efConstruction, numThreads, vectorDimension, m, metric, aceParams); } } } diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/MultiSegmentSearchResults.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/MultiSegmentSearchResults.java new file mode 100644 index 0000000000..fd43d18739 --- /dev/null +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/MultiSegmentSearchResults.java @@ -0,0 +1,55 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +package com.nvidia.cuvs; + +/** + * Holds the decoded results of a multi-segment GPU search. + * + *

Each entry {@code i} in [0, {@link #count}) identifies: + *

    + *
  • which input segment the result came from ({@link #getSegmentIndex(int)})
  • + *
  • the local vector ordinal within that segment ({@link #getOrdinal(int)})
  • + *
  • the raw CAGRA distance ({@link #getDistance(int)})
  • + *
+ * + *

The caller is responsible for mapping ordinals to Lucene doc IDs using the + * segment-specific {@code ordToDoc} function and adding {@code docBase}. + * + * @since 25.10 + */ +public class MultiSegmentSearchResults { + + private final int count; + private final int[] segmentIndices; + private final int[] ordinals; + private final float[] distances; + + MultiSegmentSearchResults(int count, int[] segmentIndices, int[] ordinals, float[] distances) { + this.count = count; + this.segmentIndices = segmentIndices; + this.ordinals = ordinals; + this.distances = distances; + } + + /** Number of valid results (may be less than k if fewer candidates exist). */ + public int count() { + return count; + } + + /** Index into the original segment list for result {@code i}. */ + public int getSegmentIndex(int i) { + return segmentIndices[i]; + } + + /** Local vector ordinal within the segment for result {@code i}. */ + public int getOrdinal(int i) { + return ordinals[i]; + } + + /** Raw CAGRA distance for result {@code i} (before score normalization). */ + public float getDistance(int i) { + return distances[i]; + } +} diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/CuVSProvider.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/CuVSProvider.java index c39578755c..fad11dab21 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/CuVSProvider.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/CuVSProvider.java @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs.spi; diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/UnsupportedProvider.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/UnsupportedProvider.java index 7cbeee4e75..8d080a0929 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/UnsupportedProvider.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/UnsupportedProvider.java @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs.spi; @@ -47,8 +47,8 @@ public HnswIndex hnswIndexFromCagra(HnswIndexParams hnswParams, CagraIndex cagra } @Override - public HnswIndex hnswIndexBuild(CuVSResources resources, HnswIndexParams hnswParams, CuVSMatrix dataset) - throws Throwable { + public HnswIndex hnswIndexBuild( + CuVSResources resources, HnswIndexParams hnswParams, CuVSMatrix dataset) throws Throwable { throw new UnsupportedOperationException(reasons); } diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/MultiSegmentCagraSearch.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/MultiSegmentCagraSearch.java new file mode 100644 index 0000000000..f63142a1f6 --- /dev/null +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/MultiSegmentCagraSearch.java @@ -0,0 +1,164 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +package com.nvidia.cuvs; + +import static com.nvidia.cuvs.internal.common.CloseableRMMAllocation.allocateRMMSegment; +import static com.nvidia.cuvs.internal.common.Util.CudaMemcpyKind.DEVICE_TO_HOST; +import static com.nvidia.cuvs.internal.common.Util.checkCuVSError; +import static com.nvidia.cuvs.internal.common.Util.cudaMemcpyAsync; +import static com.nvidia.cuvs.internal.common.Util.getStream; +import static com.nvidia.cuvs.internal.panama.headers_h.cuvsStreamSync; + +import com.nvidia.cuvs.internal.BufferedCagraSearch; +import com.nvidia.cuvs.internal.SelectKHelper; +import java.lang.foreign.Arena; +import java.lang.foreign.MemorySegment; +import java.lang.foreign.ValueLayout; +import java.util.List; + +/** + * Performs a single-query approximate nearest neighbor search across multiple CAGRA index segments + * using a shared GPU buffer, eliminating per-segment device-to-host copies. + * + *

Algorithm

+ *
    + *
  1. Allocate two global device buffers sized {@code numSegments × k}: + * one for uint32 neighbor ordinals and one for float32 distances.
  2. + *
  3. For each segment, call {@link BufferedCagraSearch#searchIntoBuffer} to queue the CAGRA + * search kernel; results are written into the segment's slice with no stream sync between + * segments.
  4. + *
  5. Sync the stream once after all segment searches are queued.
  6. + *
  7. Call {@code cuvsSelectK} to find the global top-k smallest distances across all + * {@code numSegments × k} candidates entirely on GPU.
  8. + *
  9. Sync the stream again.
  10. + *
  11. Copy the three result arrays to host in a single pass: + * k selected distances, k flat-array positions, and all {@code numSegments × k} ordinals.
  12. + *
  13. Decode each result: {@code segment = position / k}, {@code ordinal = ordinals[position]}.
  14. + *
+ * + * @since 25.10 + */ +public class MultiSegmentCagraSearch { + + private MultiSegmentCagraSearch() {} + + /** + * Searches multiple CAGRA index segments for the global top-k nearest neighbors. + * + * @param resources shared {@link CuVSResources} handle; all queries must use the same instance + * @param indices one {@link CagraIndex} per segment, in segment order; each must implement + * {@link BufferedCagraSearch} (all built-in implementations do) + * @param queries one {@link CagraQuery} per segment (same topK for all); each query encodes + * the target vector, search parameters, and optional prefilter for that segment + * @param k number of global nearest neighbors to return + * @return decoded search results with per-result (segmentIndex, ordinal, distance) + * @throws IllegalArgumentException if {@code indices} and {@code queries} differ in size, or if + * any index does not support buffered search + */ + public static MultiSegmentSearchResults search( + CuVSResources resources, List indices, List queries, int k) + throws Throwable { + int numSegments = indices.size(); + if (numSegments != queries.size()) { + throw new IllegalArgumentException( + "indices and queries must have the same size; got " + + numSegments + + " vs " + + queries.size()); + } + if (numSegments == 0) { + return new MultiSegmentSearchResults(0, new int[0], new int[0], new float[0]); + } + + // Validate that all indices support buffered search. 
+ BufferedCagraSearch[] buffered = new BufferedCagraSearch[numSegments]; + for (int i = 0; i < numSegments; i++) { + CagraIndex idx = indices.get(i); + if (!(idx instanceof BufferedCagraSearch)) { + throw new IllegalArgumentException( + "Index at position " + i + " does not support buffered search"); + } + buffered[i] = (BufferedCagraSearch) idx; + } + + long totalCandidates = (long) numSegments * k; + long neighborsBytes = totalCandidates * Integer.BYTES; // uint32 per ordinal + long distancesBytes = totalCandidates * Float.BYTES; // float32 per distance + long outIdxBytes = (long) k * Long.BYTES; // int64 positions from select_k + long outValBytes = (long) k * Float.BYTES; + + try (var resourcesAccessor = resources.access()) { + long cuvsRes = resourcesAccessor.handle(); + var cuvsStream = getStream(cuvsRes); + + try (var globalNeighborsDP = allocateRMMSegment(cuvsRes, neighborsBytes); + var globalDistancesDP = allocateRMMSegment(cuvsRes, distancesBytes); + var outIdxDP = allocateRMMSegment(cuvsRes, outIdxBytes); + var outValDP = allocateRMMSegment(cuvsRes, outValBytes)) { + + // --- Phase 1: queue all per-segment CAGRA searches --- + for (int i = 0; i < numSegments; i++) { + buffered[i].searchIntoBuffer( + queries.get(i), globalNeighborsDP.handle(), globalDistancesDP.handle(), i); + } + + // --- Phase 2: sync once, then select global top-k on GPU --- + checkCuVSError(cuvsStreamSync(cuvsRes), "cuvsStreamSync before selectK"); + + SelectKHelper.selectK( + cuvsRes, + globalDistancesDP.handle(), + totalCandidates, + outValDP.handle(), + outIdxDP.handle(), + k); + + checkCuVSError(cuvsStreamSync(cuvsRes), "cuvsStreamSync after selectK"); + + // --- Phase 3: single device-to-host copy for all three arrays --- + try (var arena = Arena.ofConfined()) { + MemorySegment hostOutIdx = arena.allocate(outIdxBytes); + MemorySegment hostOutVal = arena.allocate(outValBytes); + MemorySegment hostAllOrdinals = arena.allocate(neighborsBytes); + + cudaMemcpyAsync(hostOutIdx, 
outIdxDP.handle(), outIdxBytes, DEVICE_TO_HOST, cuvsStream); + cudaMemcpyAsync(hostOutVal, outValDP.handle(), outValBytes, DEVICE_TO_HOST, cuvsStream); + cudaMemcpyAsync( + hostAllOrdinals, + globalNeighborsDP.handle(), + neighborsBytes, + DEVICE_TO_HOST, + cuvsStream); + + checkCuVSError(cuvsStreamSync(cuvsRes), "cuvsStreamSync after D2H copy"); + + // --- Phase 4: decode results --- + int[] segmentIndices = new int[k]; + int[] selectedOrdinals = new int[k]; + float[] selectedDistances = new float[k]; + int count = 0; + + for (int j = 0; j < k; j++) { + long pos = hostOutIdx.getAtIndex(ValueLayout.JAVA_LONG, j); + float dist = hostOutVal.getAtIndex(ValueLayout.JAVA_FLOAT, j); + int ordinal = hostAllOrdinals.getAtIndex(ValueLayout.JAVA_INT, (int) pos); + + if (ordinal < 0) { + // CAGRA uses negative sentinel values for unfilled slots + continue; + } + segmentIndices[count] = (int) (pos / k); + selectedOrdinals[count] = ordinal; + selectedDistances[count] = dist; + count++; + } + + return new MultiSegmentSearchResults( + count, segmentIndices, selectedOrdinals, selectedDistances); + } + } + } + } +} diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/BufferedCagraSearch.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/BufferedCagraSearch.java new file mode 100644 index 0000000000..6a9f5fe23e --- /dev/null +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/BufferedCagraSearch.java @@ -0,0 +1,44 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +package com.nvidia.cuvs.internal; + +import com.nvidia.cuvs.CagraQuery; +import java.lang.foreign.MemorySegment; + +/** + * Internal interface implemented by CAGRA index classes that support writing + * search results directly into a caller-owned device buffer without syncing + * the stream or copying results to host. + * + *

Used by {@link com.nvidia.cuvs.MultiSegmentCagraSearch} to queue all + * per-segment searches before running a single GPU-side top-k reduction. + */ +public interface BufferedCagraSearch { + + /** + * Runs CAGRA search and writes results into a slice of caller-owned device + * buffers without copying results to host or syncing the stream. + * + *

Results are written at element offset {@code segmentIdx * query.getTopK()} + * in each buffer: + *

    + *
  • {@code globalNeighborsDP}: uint32 ordinals
  • + *
  • {@code globalDistancesDP}: float32 distances
  • + *
+ * + *

The caller must synchronize the stream after all segments have been searched. + * + * @param query query with vectors, topK, search params, optional prefilter + * @param globalNeighborsDP device pointer to the shared uint32 neighbors buffer + * @param globalDistancesDP device pointer to the shared float32 distances buffer + * @param segmentIdx zero-based segment index; determines the write offset + */ + void searchIntoBuffer( + CagraQuery query, + MemorySegment globalNeighborsDP, + MemorySegment globalDistancesDP, + int segmentIdx) + throws Throwable; +} diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CagraIndexImpl.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CagraIndexImpl.java index abc53a5945..192e0c6780 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CagraIndexImpl.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CagraIndexImpl.java @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs.internal; @@ -47,7 +47,7 @@ * * @since 25.02 */ -public class CagraIndexImpl implements CagraIndex { +public class CagraIndexImpl implements CagraIndex, BufferedCagraSearch { private final CuVSResources resources; private final IndexReference cagraIndexReference; private boolean destroyed; @@ -338,6 +338,118 @@ public SearchResults search(CagraQuery query) throws Throwable { } } + /** + * Runs CAGRA search and writes results directly into caller-owned device buffers. + * + *

Unlike {@link #search}, this method does not copy results to host memory and does + * not synchronize the CUDA stream. The caller is responsible for both after all + * per-segment searches have been queued. + * + *

Results for this segment are written starting at element offset {@code segmentIdx * topK} + * from the start of each buffer: + *

    + *
  • {@code globalNeighborsDP}: uint32 ordinals, {@code topK} entries per segment
  • + *
  • {@code globalDistancesDP}: float32 distances, {@code topK} entries per segment
  • + *
+ * + * @param query query parameters including vectors, topK, search params, and optional + * prefilter; must have exactly one query vector (numQueries == 1) + * @param globalNeighborsDP device pointer to the start of the shared uint32 neighbors buffer + * @param globalDistancesDP device pointer to the start of the shared float32 distances buffer + * @param segmentIdx zero-based index of this segment; determines the write offset + */ + @Override + public void searchIntoBuffer( + CagraQuery query, + MemorySegment globalNeighborsDP, + MemorySegment globalDistancesDP, + int segmentIdx) + throws Throwable { + try (var localArena = Arena.ofConfined()) { + checkNotDestroyed(); + int topK = query.getTopK(); + var queryVectors = (CuVSMatrixInternal) query.getQueryVectors(); + long numQueries = queryVectors.size(); + + final boolean hasPreFilter = query.getPrefilter() != null; + final BitSet[] prefilters = + hasPreFilter ? new BitSet[] {query.getPrefilter()} : EMPTY_PREFILTER_BITSET; + final long prefilterDataLength = hasPreFilter ? query.getNumDocs() * prefilters.length : 0; + final long prefilterLen = hasPreFilter ? (prefilterDataLength + 31) / 32 : 0; + final long prefilterBytes = C_INT_BYTE_SIZE * prefilterLen; + + try (var resourcesAccessor = query.getResources().access()) { + var cuvsRes = resourcesAccessor.handle(); + var cuvsStream = Util.getStream(cuvsRes); + + // Pointers into the global buffer at this segment's slice. + long neighborByteOffset = (long) segmentIdx * topK * C_INT_BYTE_SIZE; + long distanceByteOffset = (long) segmentIdx * topK * Float.BYTES; + MemorySegment neighborSlice = + MemorySegment.ofAddress(globalNeighborsDP.address() + neighborByteOffset); + MemorySegment distanceSlice = + MemorySegment.ofAddress(globalDistancesDP.address() + distanceByteOffset); + + try (var deviceQueryVectors = + (CuVSMatrixInternal) queryVectors.toDevice(query.getResources()); + var prefilterDP = + hasPreFilter + ? 
allocateRMMSegment(cuvsRes, prefilterBytes) + : CloseableRMMAllocation.EMPTY) { + + var queryTensor = deviceQueryVectors.toTensor(localArena); + long[] neighborsShape = {numQueries, topK}; + MemorySegment neighborsTensor = + prepareTensor(localArena, neighborSlice, neighborsShape, kDLUInt(), 32, kDLCUDA()); + long[] distancesShape = {numQueries, topK}; + MemorySegment distancesTensor = + prepareTensor( + localArena, + distanceSlice, + distancesShape, + deviceQueryVectors.code(), + deviceQueryVectors.bits(), + kDLCUDA()); + + MemorySegment prefilter = cuvsFilter.allocate(localArena); + if (!hasPreFilter) { + cuvsFilter.type(prefilter, 0); // NO_FILTER + cuvsFilter.addr(prefilter, 0); + } else { + BitSet concatenatedFilters = concatenate(prefilters, query.getNumDocs()); + long[] filters = concatenatedFilters.toLongArray(); + var prefilterDataMemorySegment = buildMemorySegment(localArena, filters); + long[] prefilterShape = {prefilterLen}; + Util.cudaMemcpyAsync( + prefilterDP.handle(), + prefilterDataMemorySegment, + prefilterBytes, + HOST_TO_DEVICE, + cuvsStream); + MemorySegment prefilterTensor = + prepareTensor( + localArena, prefilterDP.handle(), prefilterShape, kDLUInt(), 32, kDLCUDA()); + cuvsFilter.type(prefilter, 1); + cuvsFilter.addr(prefilter, prefilterTensor.address()); + } + + var returnValue = + cuvsCagraSearch( + cuvsRes, + segmentFromSearchParams(localArena, query.getCagraSearchParameters()), + cagraIndexReference.getMemorySegment(), + queryTensor, + neighborsTensor, + distancesTensor, + prefilter); + checkCuVSError(returnValue, "cuvsCagraSearch (searchIntoBuffer)"); + // Intentionally no cudaMemcpyAsync and no stream sync here. + // The caller syncs the stream after queuing all segment searches. 
+ } + } + } + } + @Override public void serialize(OutputStream outputStream) throws Throwable { Path path = @@ -632,8 +744,10 @@ private static void populateNativeIndexParams( cuvsAceParams.npartitions(cuvsAceParamsMemorySegment, cuVSAceParams.getNpartitions()); cuvsAceParams.ef_construction(cuvsAceParamsMemorySegment, cuVSAceParams.getEfConstruction()); cuvsAceParams.use_disk(cuvsAceParamsMemorySegment, cuVSAceParams.isUseDisk()); - cuvsAceParams.max_host_memory_gb(cuvsAceParamsMemorySegment, cuVSAceParams.getMaxHostMemoryGb()); - cuvsAceParams.max_gpu_memory_gb(cuvsAceParamsMemorySegment, cuVSAceParams.getMaxGpuMemoryGb()); + cuvsAceParams.max_host_memory_gb( + cuvsAceParamsMemorySegment, cuVSAceParams.getMaxHostMemoryGb()); + cuvsAceParams.max_gpu_memory_gb( + cuvsAceParamsMemorySegment, cuVSAceParams.getMaxGpuMemoryGb()); String buildDir = cuVSAceParams.getBuildDir(); if (buildDir != null && !buildDir.isEmpty()) { diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java index 950504bc5a..d2f828ebdb 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs.internal; diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/HnswIndexImpl.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/HnswIndexImpl.java index ca528ac010..66a53fa6d3 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/HnswIndexImpl.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/HnswIndexImpl.java @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. 
+ * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs.internal; @@ -251,8 +251,8 @@ public static HnswIndex.Builder newBuilder(CuVSResources cuvsResources) { * @return A new HNSW index ready for search * @throws Throwable if an error occurs during building */ - public static HnswIndex build(CuVSResources resources, HnswIndexParams hnswParams, CuVSMatrix dataset) - throws Throwable { + public static HnswIndex build( + CuVSResources resources, HnswIndexParams hnswParams, CuVSMatrix dataset) throws Throwable { Objects.requireNonNull(resources); Objects.requireNonNull(hnswParams); Objects.requireNonNull(dataset); @@ -288,7 +288,8 @@ public static HnswIndex build(CuVSResources resources, HnswIndexParams hnswParam return new HnswIndexImpl(new IndexReference(hnswIndex), resources, hnswParams); } - private static CloseableHandle createHnswIndexParamsForBuild(Arena arena, HnswIndexParams params) { + private static CloseableHandle createHnswIndexParamsForBuild( + Arena arena, HnswIndexParams params) { var hnswParams = createHnswIndexParams(); MemorySegment seg = hnswParams.handle(); @@ -324,7 +325,7 @@ private static MemorySegment prepareTensorFromMatrix(Arena arena, CuVSMatrix dat return prepareTensor( arena, matrixInternal.memorySegment(), - new long[]{dataset.size(), dataset.columns()}, + new long[] {dataset.size(), dataset.columns()}, matrixInternal.code(), matrixInternal.bits(), kDLCPU()); diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/SelectKHelper.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/SelectKHelper.java new file mode 100644 index 0000000000..f859ea26c7 --- /dev/null +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/SelectKHelper.java @@ -0,0 +1,94 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +package com.nvidia.cuvs.internal; + +import static com.nvidia.cuvs.internal.common.LinkerHelper.C_INT; +import static com.nvidia.cuvs.internal.common.LinkerHelper.C_LONG; +import static com.nvidia.cuvs.internal.common.LinkerHelper.C_POINTER; +import static com.nvidia.cuvs.internal.common.Util.checkCuVSError; +import static com.nvidia.cuvs.internal.common.Util.prepareTensor; +import static com.nvidia.cuvs.internal.panama.headers_h.kDLCUDA; +import static com.nvidia.cuvs.internal.panama.headers_h.kDLFloat; +import static com.nvidia.cuvs.internal.panama.headers_h.kDLInt; + +import java.lang.foreign.Arena; +import java.lang.foreign.FunctionDescriptor; +import java.lang.foreign.Linker; +import java.lang.foreign.MemorySegment; +import java.lang.foreign.SymbolLookup; +import java.lang.invoke.MethodHandle; + +/** + * Panama FFI binding for {@code cuvsSelectK}. + * + *

Selects the k smallest float values from a flat device array of n candidates, writing output + * distances and their flat-array positions (int64) into caller-supplied device buffers. + */ +public class SelectKHelper { + + private static final MethodHandle cuvsSelectK$mh; + + static { + var linker = Linker.nativeLinker(); + SymbolLookup lookup = + SymbolLookup.libraryLookup(System.mapLibraryName("cuvs_c"), Arena.ofAuto()) + .or(SymbolLookup.loaderLookup()) + .or(linker.defaultLookup()); + + cuvsSelectK$mh = + linker.downcallHandle( + lookup + .find("cuvsSelectK") + .orElseThrow(() -> new UnsatisfiedLinkError("cuvsSelectK not found in libcuvs_c")), + FunctionDescriptor.of( + C_INT, // return: cuvsError_t + C_LONG, // cuvsResources_t res + C_POINTER, // DLManagedTensor* in_val + C_POINTER, // DLManagedTensor* out_val + C_POINTER // DLManagedTensor* out_idx + )); + } + + private SelectKHelper() {} + + /** + * Selects the {@code k} smallest distances from a flat device array of {@code n} candidates. + * + *

 Output positions in {@code outIdxDP} are int64 column indices into [0, n). The caller + * recovers per-segment identity as {@code segment = position / segmentK}. + * + * @param cuvsRes cuvsResources_t handle (raw long) + * @param inValDP device pointer to float[n] input distances + * @param n number of input candidates + * @param outValDP device pointer to float[k] output distances + * @param outIdxDP device pointer to int64[k] output positions + * @param k number of results to select + */ + public static void selectK( + long cuvsRes, + MemorySegment inValDP, + long n, + MemorySegment outValDP, + MemorySegment outIdxDP, + long k) { + try (var arena = Arena.ofConfined()) { + long[] inShape = {1, n}; + long[] outShape = {1, k}; + + MemorySegment inValTensor = prepareTensor(arena, inValDP, inShape, kDLFloat(), 32, kDLCUDA()); + MemorySegment outValTensor = + prepareTensor(arena, outValDP, outShape, kDLFloat(), 32, kDLCUDA()); + MemorySegment outIdxTensor = + prepareTensor(arena, outIdxDP, outShape, kDLInt(), 64, kDLCUDA()); + + int rc = (int) cuvsSelectK$mh.invokeExact(cuvsRes, inValTensor, outValTensor, outIdxTensor); + checkCuVSError(rc, "cuvsSelectK"); + } catch (RuntimeException | Error e) { + throw e; + } catch (Throwable t) { + throw new RuntimeException("cuvsSelectK failed", t); + } + } +} diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java index 1d3199f26f..540d124213 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs.spi; @@ -255,8 +255,8 @@ public HnswIndex hnswIndexFromCagra(HnswIndexParams hnswParams, CagraIndex cagra } @Override - public HnswIndex hnswIndexBuild(CuVSResources resources, HnswIndexParams hnswParams, CuVSMatrix dataset) - throws Throwable { + public HnswIndex hnswIndexBuild( + CuVSResources resources, HnswIndexParams hnswParams, CuVSMatrix dataset) throws Throwable { return HnswIndexImpl.build(resources, hnswParams, dataset); } From 390adecb9ae1b768fa9c05760034dceb8e8df961 Mon Sep 17 00:00:00 2001 From: James Xia Date: Wed, 1 Apr 2026 17:00:35 -0700 Subject: [PATCH 02/11] Add parallel multi-segment CAGRA search with async memory resource Implements concurrent per-segment GPU search that eliminates per-segment device-to-host copies and CPU blocking on workspace deallocation. Key changes: - CudaStreamPool: fixed-size pool of non-blocking CUDA streams (one cuvsResources_t per slot). Segments are assigned to slots via round-robin so searches on different slots run concurrently. Pool size defaults to 8 and is overridden via the system property com.nvidia.cuvs.streamPoolSize. - MultiSegmentCagraSearch: single-query search across N index segments using the stream pool. All per-segment CAGRA kernels write into a shared device buffer (no per-segment D2H copy or stream sync). A single cuvsSelectK call finds the global top-k entirely on GPU, then one D2H copy transfers the results. - BufferedCagraSearch / CagraIndexImpl: new searchIntoBuffer() method that queues a CAGRA search kernel on a caller-supplied stream and writes results at a given row offset into a pre-allocated device buffer. - cuvsRMMAsyncMemoryResourceEnable() (C API + Java bindings): switches the current device memory resource to cuda_async_memory_resource so that workspace deallocations issued by CAGRA's search plan destructor are stream-ordered and non-blocking. 
Without this, cudaFree serializes kernel launches across streams regardless of stream assignment, nullifying the stream pool benefit. --- c/include/cuvs/core/c_api.h | 13 ++ c/src/core/c_api.cpp | 16 +- .../com/nvidia/cuvs/spi/CuVSProvider.java | 10 ++ .../nvidia/cuvs/spi/UnsupportedProvider.java | 5 + .../nvidia/cuvs/MultiSegmentCagraSearch.java | 84 +++++++-- .../cuvs/internal/BufferedCagraSearch.java | 18 +- .../nvidia/cuvs/internal/CagraIndexImpl.java | 164 +++++++++--------- .../cuvs/internal/CuVSParamsHelper.java | 27 +++ .../cuvs/internal/CuVSResourcesImpl.java | 1 + .../nvidia/cuvs/internal/CudaStreamPool.java | 161 +++++++++++++++++ .../com/nvidia/cuvs/spi/JDKProvider.java | 9 + 11 files changed, 404 insertions(+), 104 deletions(-) create mode 100644 java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CudaStreamPool.java diff --git a/c/include/cuvs/core/c_api.h b/c/include/cuvs/core/c_api.h index b9941d6ae4..b9678b3771 100644 --- a/c/include/cuvs/core/c_api.h +++ b/c/include/cuvs/core/c_api.h @@ -210,6 +210,19 @@ cuvsError_t cuvsRMMFree(cuvsResources_t res, void* ptr, size_t bytes); cuvsError_t cuvsRMMPoolMemoryResourceEnable(int initial_pool_size_percent, int max_pool_size_percent, bool managed); +/** + * @brief Switches the working memory resource to use stream-ordered asynchronous allocation + * (cudaMallocAsync / cudaFreeAsync). Unlike the pool resource, this resource returns memory to + * the stream immediately without blocking the CPU, eliminating device-wide synchronization on + * deallocation. This is especially beneficial when multiple CAGRA searches run concurrently on + * separate CUDA streams, because the internal workspace allocations no longer serialize kernel + * launches. Be aware that this function will change the memory resource for the whole process + * and the new memory resource will be used until explicitly changed. 
+ * + * @return cuvsError_t + */ +cuvsError_t cuvsRMMAsyncMemoryResourceEnable(); + /** * @brief Resets the memory resource to use the default memory resource (cuda_memory_resource) * @return cuvsError_t diff --git a/c/src/core/c_api.cpp b/c/src/core/c_api.cpp index f4e3664482..b27b3bc6d5 100644 --- a/c/src/core/c_api.cpp +++ b/c/src/core/c_api.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -146,6 +147,8 @@ extern "C" cuvsError_t cuvsRMMFree(cuvsResources_t res, void* ptr, size_t bytes) }); } +thread_local std::shared_ptr async_mr; + extern "C" cuvsError_t cuvsRMMPoolMemoryResourceEnable(int initial_pool_size_percent, int max_pool_size_percent, bool managed) @@ -164,9 +167,20 @@ extern "C" cuvsError_t cuvsRMMPoolMemoryResourceEnable(int initial_pool_size_per }); } +extern "C" cuvsError_t cuvsRMMAsyncMemoryResourceEnable() +{ + return cuvs::core::translate_exceptions([=] { + async_mr = std::make_shared(); + rmm::mr::set_current_device_resource(async_mr.get()); + }); +} + extern "C" cuvsError_t cuvsRMMMemoryResourceReset() { - return cuvs::core::translate_exceptions([=] { rmm::mr::reset_current_device_resource(); }); + return cuvs::core::translate_exceptions([=] { + rmm::mr::reset_current_device_resource(); + async_mr.reset(); + }); } thread_local std::unique_ptr pinned_mr; diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/CuVSProvider.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/CuVSProvider.java index fad11dab21..558d2e73f7 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/CuVSProvider.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/CuVSProvider.java @@ -189,6 +189,16 @@ default CagraIndex mergeCagraIndexes(CagraIndex[] indexes, CagraIndexParams merg */ void enableRMMManagedPooledMemory(int initialPoolSizePercent, int maxPoolSizePercent); + /** + * Switch RMM allocations to use stream-ordered asynchronous allocation + * ({@code cudaMallocAsync} / {@code cudaFreeAsync}). 
Unlike the pool resource, this resource + * returns memory to the stream without blocking the CPU, eliminating device-wide synchronization + * on deallocation. This is especially beneficial when multiple CAGRA searches run concurrently + * on separate CUDA streams, because internal workspace allocations no longer serialize kernel + * launches. This operation has a global effect and will affect all resources on the current device. + */ + void enableRMMAsyncMemory(); + /** Disables pooled memory on the current device, reverting back to the default setting. */ void resetRMMPooledMemory(); diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/UnsupportedProvider.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/UnsupportedProvider.java index 8d080a0929..6701afa47b 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/UnsupportedProvider.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/UnsupportedProvider.java @@ -106,6 +106,11 @@ public Level getLogLevel() { throw new UnsupportedOperationException(reasons); } + @Override + public void enableRMMAsyncMemory() { + throw new UnsupportedOperationException(reasons); + } + @Override public void enableRMMPooledMemory(int initialPoolSizePercent, int maxPoolSizePercent) { throw new UnsupportedOperationException(reasons); diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/MultiSegmentCagraSearch.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/MultiSegmentCagraSearch.java index f63142a1f6..f39a0b2d0e 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/MultiSegmentCagraSearch.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/MultiSegmentCagraSearch.java @@ -7,32 +7,39 @@ import static com.nvidia.cuvs.internal.common.CloseableRMMAllocation.allocateRMMSegment; import static com.nvidia.cuvs.internal.common.Util.CudaMemcpyKind.DEVICE_TO_HOST; import static com.nvidia.cuvs.internal.common.Util.checkCuVSError; +import static 
com.nvidia.cuvs.internal.common.Util.checkCudaError; import static com.nvidia.cuvs.internal.common.Util.cudaMemcpyAsync; import static com.nvidia.cuvs.internal.common.Util.getStream; +import static com.nvidia.cuvs.internal.panama.headers_h.cudaEventRecord; import static com.nvidia.cuvs.internal.panama.headers_h.cuvsStreamSync; +import static com.nvidia.cuvs.internal.panama.headers_h_1.cudaStreamWaitEvent; import com.nvidia.cuvs.internal.BufferedCagraSearch; +import com.nvidia.cuvs.internal.CuVSParamsHelper; +import com.nvidia.cuvs.internal.CudaStreamPool; import com.nvidia.cuvs.internal.SelectKHelper; import java.lang.foreign.Arena; import java.lang.foreign.MemorySegment; import java.lang.foreign.ValueLayout; +import java.util.Arrays; import java.util.List; /** * Performs a single-query approximate nearest neighbor search across multiple CAGRA index segments - * using a shared GPU buffer, eliminating per-segment device-to-host copies. + * using a shared GPU buffer and a fixed-size CUDA stream pool, eliminating per-segment + * device-to-host copies. * *

Algorithm

*
    *
  1. Allocate two global device buffers sized {@code numSegments × k}: * one for uint32 neighbor ordinals and one for float32 distances.
  2. + *
  3. Assign each segment a slot from the {@link CudaStreamPool} via round-robin. Segments on + * different slots run in parallel on separate CUDA streams.
  4. *
  5. For each segment, call {@link BufferedCagraSearch#searchIntoBuffer} to queue the CAGRA - * search kernel; results are written into the segment's slice with no stream sync between - * segments.
  6. - *
  7. Sync the stream once after all segment searches are queued.
  8. - *
  9. Call {@code cuvsSelectK} to find the global top-k smallest distances across all - * {@code numSegments × k} candidates entirely on GPU.
  10. - *
  11. Sync the stream again.
  12. + * search kernel on the slot's stream; no per-segment sync or D2H copy. + *
  13. Record a CUDA event on each slot's stream; make the main stream wait on all events.
  14. + *
  15. Call {@code cuvsSelectK} on the main stream to find the global top-k entirely on GPU.
  16. + *
  17. Sync the main stream.
  18. *
  19. Copy the three result arrays to host in a single pass: * k selected distances, k flat-array positions, and all {@code numSegments × k} ordinals.
  20. *
  21. Decode each result: {@code segment = position / k}, {@code ordinal = ordinals[position]}.
  22. @@ -89,6 +96,14 @@ public static MultiSegmentSearchResults search( long outIdxBytes = (long) k * Long.BYTES; // int64 positions from select_k long outValBytes = (long) k * Float.BYTES; + // Assign a pool slot to each segment via round-robin. + CudaStreamPool pool = CudaStreamPool.getOrCreate(); + int startSlot = CudaStreamPool.slotCounter.getAndAdd(numSegments); + int[] slots = new int[numSegments]; + for (int i = 0; i < numSegments; i++) { + slots[i] = Math.floorMod(startSlot + i, pool.size()); + } + try (var resourcesAccessor = resources.access()) { long cuvsRes = resourcesAccessor.handle(); var cuvsStream = getStream(cuvsRes); @@ -98,15 +113,46 @@ public static MultiSegmentSearchResults search( var outIdxDP = allocateRMMSegment(cuvsRes, outIdxBytes); var outValDP = allocateRMMSegment(cuvsRes, outValBytes)) { - // --- Phase 1: queue all per-segment CAGRA searches --- - for (int i = 0; i < numSegments; i++) { - buffered[i].searchIntoBuffer( - queries.get(i), globalNeighborsDP.handle(), globalDistancesDP.handle(), i); + // --- Phase 1: queue all per-segment CAGRA searches, each on its own stream --- + // Use a single arena and pre-built search params shared across all segments to avoid + // repeated Arena.ofConfined() and segmentFromSearchParams calls. + // The arena is closed after Phase 1; all CPU-side structs are only needed until + // cuvsCagraSearch returns (the kernel launch is synchronous on the CPU side). 
+ try (var segArena = Arena.ofConfined()) { + MemorySegment searchParams = + CuVSParamsHelper.buildCagraSearchParams( + segArena, queries.get(0).getCagraSearchParameters()); + for (int i = 0; i < numSegments; i++) { + buffered[i].searchIntoBuffer( + queries.get(i), + globalNeighborsDP.handle(), + globalDistancesDP.handle(), + i, + pool.resources(slots[i]), + pool.stream(slots[i]), + searchParams, + segArena); + } } - // --- Phase 2: sync once, then select global top-k on GPU --- - checkCuVSError(cuvsStreamSync(cuvsRes), "cuvsStreamSync before selectK"); + // --- Phase 2: event-based sync — make main stream wait for all segment streams --- + // Record one event per distinct slot (on the last kernel submitted to that slot); + // this is O(pool.size()) API calls instead of O(numSegments). + // Pool events are pre-allocated and reused across calls to avoid create/destroy overhead. + int[] lastSegmentForSlot = new int[pool.size()]; + Arrays.fill(lastSegmentForSlot, -1); + for (int i = 0; i < numSegments; i++) { + lastSegmentForSlot[slots[i]] = i; + } + for (int slot = 0; slot < pool.size(); slot++) { + if (lastSegmentForSlot[slot] >= 0) { + checkCudaError(cudaEventRecord(pool.event(slot), pool.stream(slot)), "cudaEventRecord"); + checkCudaError( + cudaStreamWaitEvent(cuvsStream, pool.event(slot), 0), "cudaStreamWaitEvent"); + } + } + // --- Phase 3: select global top-k on GPU (after all segment searches complete) --- SelectKHelper.selectK( cuvsRes, globalDistancesDP.handle(), @@ -117,11 +163,11 @@ public static MultiSegmentSearchResults search( checkCuVSError(cuvsStreamSync(cuvsRes), "cuvsStreamSync after selectK"); - // --- Phase 3: single device-to-host copy for all three arrays --- - try (var arena = Arena.ofConfined()) { - MemorySegment hostOutIdx = arena.allocate(outIdxBytes); - MemorySegment hostOutVal = arena.allocate(outValBytes); - MemorySegment hostAllOrdinals = arena.allocate(neighborsBytes); + // --- Phase 4: single device-to-host copy for all three arrays 
--- + try (var hostArena = Arena.ofConfined()) { + MemorySegment hostOutIdx = hostArena.allocate(outIdxBytes); + MemorySegment hostOutVal = hostArena.allocate(outValBytes); + MemorySegment hostAllOrdinals = hostArena.allocate(neighborsBytes); cudaMemcpyAsync(hostOutIdx, outIdxDP.handle(), outIdxBytes, DEVICE_TO_HOST, cuvsStream); cudaMemcpyAsync(hostOutVal, outValDP.handle(), outValBytes, DEVICE_TO_HOST, cuvsStream); @@ -134,7 +180,7 @@ public static MultiSegmentSearchResults search( checkCuVSError(cuvsStreamSync(cuvsRes), "cuvsStreamSync after D2H copy"); - // --- Phase 4: decode results --- + // --- Phase 5: decode results --- int[] segmentIndices = new int[k]; int[] selectedOrdinals = new int[k]; float[] selectedDistances = new float[k]; diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/BufferedCagraSearch.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/BufferedCagraSearch.java index 6a9f5fe23e..20390ce4cc 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/BufferedCagraSearch.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/BufferedCagraSearch.java @@ -5,6 +5,7 @@ package com.nvidia.cuvs.internal; import com.nvidia.cuvs.CagraQuery; +import java.lang.foreign.Arena; import java.lang.foreign.MemorySegment; /** @@ -28,17 +29,30 @@ public interface BufferedCagraSearch { *
  23. {@code globalDistancesDP}: float32 distances
  24. * * - *

    The caller must synchronize the stream after all segments have been searched. + *

    The search is submitted to {@code segmentStream}. The caller is responsible for + * synchronizing that stream (e.g. via a CUDA event) before consuming the output buffers. * * @param query query with vectors, topK, search params, optional prefilter * @param globalNeighborsDP device pointer to the shared uint32 neighbors buffer * @param globalDistancesDP device pointer to the shared float32 distances buffer * @param segmentIdx zero-based segment index; determines the write offset + * @param segmentCuvsRes {@code cuvsResources_t} handle whose CUDA stream receives the kernel + * @param segmentStream CUDA stream corresponding to {@code segmentCuvsRes}; passed explicitly + * to avoid a redundant {@code cuvsStreamGet} call inside the method + * @param searchParams pre-built {@code cuvsCagraSearchParams} struct; shared across all + * segments to avoid repeated allocation and population + * @param arena shared scratch arena for per-call CPU-side allocations (tensor + * descriptors, filter struct); must remain open until after this call + * returns, and the GPU kernel has launched */ void searchIntoBuffer( CagraQuery query, MemorySegment globalNeighborsDP, MemorySegment globalDistancesDP, - int segmentIdx) + int segmentIdx, + long segmentCuvsRes, + MemorySegment segmentStream, + MemorySegment searchParams, + Arena arena) throws Throwable; } diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CagraIndexImpl.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CagraIndexImpl.java index 192e0c6780..fb102c5fca 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CagraIndexImpl.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CagraIndexImpl.java @@ -357,96 +357,96 @@ public SearchResults search(CagraQuery query) throws Throwable { * @param globalNeighborsDP device pointer to the start of the shared uint32 neighbors buffer * @param globalDistancesDP device pointer to the start of the shared float32 distances buffer * 
@param segmentIdx zero-based index of this segment; determines the write offset + * @param segmentCuvsRes {@code cuvsResources_t} handle whose CUDA stream receives the kernel */ @Override public void searchIntoBuffer( CagraQuery query, MemorySegment globalNeighborsDP, MemorySegment globalDistancesDP, - int segmentIdx) + int segmentIdx, + long segmentCuvsRes, + MemorySegment segmentStream, + MemorySegment searchParams, + Arena arena) throws Throwable { - try (var localArena = Arena.ofConfined()) { - checkNotDestroyed(); - int topK = query.getTopK(); - var queryVectors = (CuVSMatrixInternal) query.getQueryVectors(); - long numQueries = queryVectors.size(); - - final boolean hasPreFilter = query.getPrefilter() != null; - final BitSet[] prefilters = - hasPreFilter ? new BitSet[] {query.getPrefilter()} : EMPTY_PREFILTER_BITSET; - final long prefilterDataLength = hasPreFilter ? query.getNumDocs() * prefilters.length : 0; - final long prefilterLen = hasPreFilter ? (prefilterDataLength + 31) / 32 : 0; - final long prefilterBytes = C_INT_BYTE_SIZE * prefilterLen; - - try (var resourcesAccessor = query.getResources().access()) { - var cuvsRes = resourcesAccessor.handle(); - var cuvsStream = Util.getStream(cuvsRes); - - // Pointers into the global buffer at this segment's slice. - long neighborByteOffset = (long) segmentIdx * topK * C_INT_BYTE_SIZE; - long distanceByteOffset = (long) segmentIdx * topK * Float.BYTES; - MemorySegment neighborSlice = - MemorySegment.ofAddress(globalNeighborsDP.address() + neighborByteOffset); - MemorySegment distanceSlice = - MemorySegment.ofAddress(globalDistancesDP.address() + distanceByteOffset); - - try (var deviceQueryVectors = - (CuVSMatrixInternal) queryVectors.toDevice(query.getResources()); - var prefilterDP = - hasPreFilter - ? 
allocateRMMSegment(cuvsRes, prefilterBytes) - : CloseableRMMAllocation.EMPTY) { - - var queryTensor = deviceQueryVectors.toTensor(localArena); - long[] neighborsShape = {numQueries, topK}; - MemorySegment neighborsTensor = - prepareTensor(localArena, neighborSlice, neighborsShape, kDLUInt(), 32, kDLCUDA()); - long[] distancesShape = {numQueries, topK}; - MemorySegment distancesTensor = - prepareTensor( - localArena, - distanceSlice, - distancesShape, - deviceQueryVectors.code(), - deviceQueryVectors.bits(), - kDLCUDA()); - - MemorySegment prefilter = cuvsFilter.allocate(localArena); - if (!hasPreFilter) { - cuvsFilter.type(prefilter, 0); // NO_FILTER - cuvsFilter.addr(prefilter, 0); - } else { - BitSet concatenatedFilters = concatenate(prefilters, query.getNumDocs()); - long[] filters = concatenatedFilters.toLongArray(); - var prefilterDataMemorySegment = buildMemorySegment(localArena, filters); - long[] prefilterShape = {prefilterLen}; - Util.cudaMemcpyAsync( - prefilterDP.handle(), - prefilterDataMemorySegment, - prefilterBytes, - HOST_TO_DEVICE, - cuvsStream); - MemorySegment prefilterTensor = - prepareTensor( - localArena, prefilterDP.handle(), prefilterShape, kDLUInt(), 32, kDLCUDA()); - cuvsFilter.type(prefilter, 1); - cuvsFilter.addr(prefilter, prefilterTensor.address()); - } - - var returnValue = - cuvsCagraSearch( - cuvsRes, - segmentFromSearchParams(localArena, query.getCagraSearchParameters()), - cagraIndexReference.getMemorySegment(), - queryTensor, - neighborsTensor, - distancesTensor, - prefilter); - checkCuVSError(returnValue, "cuvsCagraSearch (searchIntoBuffer)"); - // Intentionally no cudaMemcpyAsync and no stream sync here. - // The caller syncs the stream after queuing all segment searches. 
- } + checkNotDestroyed(); + int topK = query.getTopK(); + var queryVectors = (CuVSMatrixInternal) query.getQueryVectors(); + long numQueries = queryVectors.size(); + + final boolean hasPreFilter = query.getPrefilter() != null; + final BitSet[] prefilters = + hasPreFilter ? new BitSet[] {query.getPrefilter()} : EMPTY_PREFILTER_BITSET; + final long prefilterDataLength = hasPreFilter ? query.getNumDocs() * prefilters.length : 0; + final long prefilterLen = hasPreFilter ? (prefilterDataLength + 31) / 32 : 0; + final long prefilterBytes = C_INT_BYTE_SIZE * prefilterLen; + + // Pointers into the global buffer at this segment's slice. + long neighborByteOffset = (long) segmentIdx * topK * C_INT_BYTE_SIZE; + long distanceByteOffset = (long) segmentIdx * topK * Float.BYTES; + MemorySegment neighborSlice = + MemorySegment.ofAddress(globalNeighborsDP.address() + neighborByteOffset); + MemorySegment distanceSlice = + MemorySegment.ofAddress(globalDistancesDP.address() + distanceByteOffset); + + if (!(queryVectors instanceof CuVSDeviceMatrix)) { + throw new IllegalArgumentException( + "searchIntoBuffer requires query vectors already on device"); + } + try (var prefilterDP = + hasPreFilter + ? 
allocateRMMSegment(segmentCuvsRes, prefilterBytes) + : CloseableRMMAllocation.EMPTY) { + var deviceQueryVectors = (CuVSMatrixInternal) queryVectors; + + var queryTensor = deviceQueryVectors.toTensor(arena); + long[] neighborsShape = {numQueries, topK}; + MemorySegment neighborsTensor = + prepareTensor(arena, neighborSlice, neighborsShape, kDLUInt(), 32, kDLCUDA()); + long[] distancesShape = {numQueries, topK}; + MemorySegment distancesTensor = + prepareTensor( + arena, + distanceSlice, + distancesShape, + deviceQueryVectors.code(), + deviceQueryVectors.bits(), + kDLCUDA()); + + MemorySegment prefilter = cuvsFilter.allocate(arena); + if (!hasPreFilter) { + cuvsFilter.type(prefilter, 0); // NO_FILTER + cuvsFilter.addr(prefilter, 0); + } else { + BitSet concatenatedFilters = concatenate(prefilters, query.getNumDocs()); + long[] filters = concatenatedFilters.toLongArray(); + var prefilterDataMemorySegment = buildMemorySegment(arena, filters); + long[] prefilterShape = {prefilterLen}; + Util.cudaMemcpyAsync( + prefilterDP.handle(), + prefilterDataMemorySegment, + prefilterBytes, + HOST_TO_DEVICE, + segmentStream); + MemorySegment prefilterTensor = + prepareTensor(arena, prefilterDP.handle(), prefilterShape, kDLUInt(), 32, kDLCUDA()); + cuvsFilter.type(prefilter, 1); + cuvsFilter.addr(prefilter, prefilterTensor.address()); } + + var returnValue = + cuvsCagraSearch( + segmentCuvsRes, + searchParams, + cagraIndexReference.getMemorySegment(), + queryTensor, + neighborsTensor, + distancesTensor, + prefilter); + checkCuVSError(returnValue, "cuvsCagraSearch (searchIntoBuffer)"); + // Intentionally no cudaMemcpyAsync and no stream sync here. + // The caller syncs the stream after queuing all segment searches. 
} } diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java index d2f828ebdb..ee12e7d88f 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java @@ -7,7 +7,9 @@ import static com.nvidia.cuvs.internal.common.Util.checkCuVSError; import static com.nvidia.cuvs.internal.panama.headers_h.*; +import com.nvidia.cuvs.CagraSearchParams; import com.nvidia.cuvs.internal.common.CloseableHandle; +import com.nvidia.cuvs.internal.panama.*; import java.lang.foreign.Arena; import java.lang.foreign.MemorySegment; @@ -25,6 +27,31 @@ public final class CuVSParamsHelper { private CuVSParamsHelper() {} + /** + * Allocates and populates a {@code cuvsCagraSearchParams} struct into {@code arena}. + * The returned segment is valid for the lifetime of {@code arena}. + */ + public static MemorySegment buildCagraSearchParams(Arena arena, CagraSearchParams params) { + MemorySegment seg = cuvsCagraSearchParams.allocate(arena); + cuvsCagraSearchParams.max_queries(seg, params.getMaxQueries()); + cuvsCagraSearchParams.itopk_size(seg, params.getITopKSize()); + cuvsCagraSearchParams.max_iterations(seg, params.getMaxIterations()); + if (params.getCagraSearchAlgo() != null) { + cuvsCagraSearchParams.algo(seg, params.getCagraSearchAlgo().value); + } + cuvsCagraSearchParams.team_size(seg, params.getTeamSize()); + cuvsCagraSearchParams.search_width(seg, params.getSearchWidth()); + cuvsCagraSearchParams.min_iterations(seg, params.getMinIterations()); + cuvsCagraSearchParams.thread_block_size(seg, params.getThreadBlockSize()); + if (params.getHashMapMode() != null) { + cuvsCagraSearchParams.hashmap_mode(seg, params.getHashMapMode().value); + } + cuvsCagraSearchParams.hashmap_max_fill_rate(seg, params.getHashMapMaxFillRate()); + cuvsCagraSearchParams.num_random_samplings(seg, 
params.getNumRandomSamplings()); + cuvsCagraSearchParams.rand_xor_mask(seg, params.getRandXORMask()); + return seg; + } + public static CloseableHandle createCagraIndexParams() { try (var localArena = Arena.ofConfined()) { var paramsPtrPtr = localArena.allocate(cuvsCagraIndexParams_t); diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSResourcesImpl.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSResourcesImpl.java index efdf7283ac..630155f883 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSResourcesImpl.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSResourcesImpl.java @@ -59,6 +59,7 @@ public int deviceId() { @Override public void close() { synchronized (this) { + CudaStreamPool.closeInstance(); int returnValue = cuvsResourcesDestroy(resourceHandle); checkCuVSError(returnValue, "cuvsResourcesDestroy"); hostBuffer.close(); diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CudaStreamPool.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CudaStreamPool.java new file mode 100644 index 0000000000..22d95af7c1 --- /dev/null +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CudaStreamPool.java @@ -0,0 +1,161 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +package com.nvidia.cuvs.internal; + +import static com.nvidia.cuvs.internal.common.Util.checkCuVSError; +import static com.nvidia.cuvs.internal.common.Util.checkCudaError; +import static com.nvidia.cuvs.internal.panama.headers_h.cudaEventCreateWithFlags; +import static com.nvidia.cuvs.internal.panama.headers_h.cudaEventDestroy; +import static com.nvidia.cuvs.internal.panama.headers_h.cuvsResourcesCreate; +import static com.nvidia.cuvs.internal.panama.headers_h.cuvsResourcesDestroy; +import static com.nvidia.cuvs.internal.panama.headers_h.cuvsResources_t; +import static com.nvidia.cuvs.internal.panama.headers_h.cuvsStreamSet; +import static com.nvidia.cuvs.internal.panama.headers_h_1.cudaEventDisableTiming; +import static com.nvidia.cuvs.internal.panama.headers_h_1.cudaEvent_t; +import static com.nvidia.cuvs.internal.panama.headers_h_1.cudaStreamCreateWithFlags; +import static com.nvidia.cuvs.internal.panama.headers_h_1.cudaStreamDestroy; +import static com.nvidia.cuvs.internal.panama.headers_h_1.cudaStreamNonBlocking; +import static com.nvidia.cuvs.internal.panama.headers_h_1.cudaStream_t; + +import java.lang.foreign.Arena; +import java.lang.foreign.MemorySegment; +import java.util.concurrent.atomic.AtomicInteger; + +/** + * A fixed-size pool of CUDA streams used by {@link com.nvidia.cuvs.MultiSegmentCagraSearch} + * to run per-segment CAGRA searches in parallel. + * + *

    Each pool slot owns one {@code cuvsResources_t} handle backed by a dedicated non-blocking + * CUDA stream. Callers assign segments to slots via round-robin using {@link #slotCounter}; the + * GPU executes searches on different slots concurrently, then synchronizes via CUDA events before + * the global {@code cuvsSelectK} call. + * + *

    Lifecycle

    + *

    The pool is a lazily-initialized static singleton. Call {@link #getOrCreate()} to obtain it + * on first use and {@link #closeInstance()} from {@code CuVSResourcesImpl.close()} to release + * all CUDA resources at application shutdown. + * + *

    Configuration

    + *

    Pool size defaults to {@value #DEFAULT_SIZE} and can be overridden via the system property + * {@value #SIZE_PROPERTY}. + */ +public final class CudaStreamPool implements AutoCloseable { + + /** Default number of streams in the pool. */ + public static final int DEFAULT_SIZE = 8; + + /** System property name for overriding the pool size. */ + public static final String SIZE_PROPERTY = "com.nvidia.cuvs.streamPoolSize"; + + /** Round-robin counter shared across all {@code search()} calls. */ + public static final AtomicInteger slotCounter = new AtomicInteger(); + + private static volatile CudaStreamPool instance; + + private final long[] resources; // cuvsResources_t handles + private final MemorySegment[] streams; // cudaStream_t handle values + private final MemorySegment[] events; // pre-allocated cudaEvent_t handles, one per slot + private final int size; + + private CudaStreamPool(int size) { + this.size = size; + this.resources = new long[size]; + this.streams = new MemorySegment[size]; + this.events = new MemorySegment[size]; + try (var arena = Arena.ofConfined()) { + for (int i = 0; i < size; i++) { + // Create a non-blocking CUDA stream (avoids implicit sync with the default stream). + var pStream = arena.allocate(cudaStream_t); + checkCudaError( + cudaStreamCreateWithFlags(pStream, cudaStreamNonBlocking()), "cudaStreamCreate"); + streams[i] = pStream.get(cudaStream_t, 0); + + // Create a cuvsResources_t and assign the new stream to it. + var pRes = arena.allocate(cuvsResources_t); + checkCuVSError(cuvsResourcesCreate(pRes), "cuvsResourcesCreate"); + resources[i] = pRes.get(cuvsResources_t, 0); + checkCuVSError(cuvsStreamSet(resources[i], streams[i]), "cuvsStreamSet"); + + // Pre-allocate one reusable event per slot (disable timing to avoid overhead). 
+ var pEvent = arena.allocate(cudaEvent_t); + checkCudaError( + cudaEventCreateWithFlags(pEvent, cudaEventDisableTiming()), "cudaEventCreate"); + events[i] = pEvent.get(cudaEvent_t, 0); + } + } + } + + // ------------------------------------------------------------------------- + // Static singleton API + // ------------------------------------------------------------------------- + + /** + * Returns the singleton pool, creating it on first call. + */ + public static CudaStreamPool getOrCreate() { + CudaStreamPool pool = instance; + if (pool == null) { + synchronized (CudaStreamPool.class) { + pool = instance; + if (pool == null) { + int size = Integer.getInteger(SIZE_PROPERTY, DEFAULT_SIZE); + instance = pool = new CudaStreamPool(size); + } + } + } + return pool; + } + + /** + * Closes and nulls the singleton pool. Called from {@code CuVSResourcesImpl.close()}. + */ + public static void closeInstance() { + CudaStreamPool pool; + synchronized (CudaStreamPool.class) { + pool = instance; + instance = null; + } + if (pool != null) { + pool.close(); + } + } + + // ------------------------------------------------------------------------- + // Per-slot accessors + // ------------------------------------------------------------------------- + + /** Returns the {@code cuvsResources_t} handle for the given slot. */ + public long resources(int slot) { + return resources[slot]; + } + + /** Returns the CUDA stream handle for the given slot. */ + public MemorySegment stream(int slot) { + return streams[slot]; + } + + /** Returns the pre-allocated CUDA event handle for the given slot. */ + public MemorySegment event(int slot) { + return events[slot]; + } + + /** Returns the number of slots in this pool. 
*/ + public int size() { + return size; + } + + // ------------------------------------------------------------------------- + // Lifecycle + // ------------------------------------------------------------------------- + + @Override + public void close() { + for (int i = 0; i < size; i++) { + checkCudaError(cudaEventDestroy(events[i]), "cudaEventDestroy"); + cuvsResourcesDestroy(resources[i]); + checkCudaError(cudaStreamDestroy(streams[i]), "cudaStreamDestroy"); + } + } +} diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java index 540d124213..4594ad1e93 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java @@ -138,6 +138,9 @@ public String toString() { private final cuvsRMMMemoryResourceReset cuvsRMMMemoryResourceResetInvoker = cuvsRMMMemoryResourceReset.makeInvoker(); + private final cuvsRMMAsyncMemoryResourceEnable cuvsRMMAsyncMemoryResourceEnableInvoker = + cuvsRMMAsyncMemoryResourceEnable.makeInvoker(); + private final cuvsGetLogLevel GET_LOG_LEVEL_INVOKER = cuvsGetLogLevel.makeInvoker(); private JDKProvider() {} @@ -436,6 +439,12 @@ public Level getLogLevel() { throw new IllegalArgumentException("Unexpected log level [" + logLevel + "]"); } + @Override + public void enableRMMAsyncMemory() { + checkCuVSError( + cuvsRMMAsyncMemoryResourceEnableInvoker.apply(), "cuvsRMMAsyncMemoryResourceEnable"); + } + @Override public void enableRMMPooledMemory(int initialPoolSizePercent, int maxPoolSizePercent) { checkCuVSError( From 92dbcf99de9076845041e86fe467d7bd868c2a78 Mon Sep 17 00:00:00 2001 From: James Xia Date: Thu, 2 Apr 2026 08:51:25 -0700 Subject: [PATCH 03/11] Fix CudaStreamPool races and reduce multi-segment search overhead MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CudaStreamPool was a static singleton shared across all 
threads. With multiple concurrent query threads, calls to MultiSegmentCagraSearch.search() would alias onto the same pool slots after Math.floorMod, causing concurrent cudaEventRecord and cudaStreamWaitEvent calls on the same event handle (undefined behavior) and cross-thread stream interference. Additionally, CudaStreamPool.closeInstance() was called from CuVSResourcesImpl.close(), so whichever thread closed its resources first would destroy the shared pool while other threads were still using it. Fix: make CudaStreamPool a per-CuVSResources instance rather than a static singleton. One pool is created and owned by each CuVSResourcesImpl; it is closed when that instance is closed. Since CuVSResources is thread-local in the Lucene integration, each query thread gets its own independent set of streams and events with no sharing or locking required. - CudaStreamPool: remove static singleton (getOrCreate, closeInstance, static volatile instance, static AtomicInteger slotCounter); add package-private constructor; replace static slotCounter with an instance int and a new nextSlot(int count) method. - CuVSResourcesImpl: add final CudaStreamPool streamPool field (sized from com.nvidia.cuvs.streamPoolSize system property); close it directly in close(); add static getStreamPool(CuVSResources) helper for MultiSegmentCagraSearch to retrieve the per-resources pool. - MultiSegmentCagraSearch: get pool via CuVSResourcesImpl.getStreamPool and advance via pool.nextSlot. Remove redundant cuvsStreamSync after cuvsSelectK — the D2H copies are enqueued on the same stream so CUDA ordering already serializes them. Replace three separate hostArena.allocate calls with one contiguous allocation (Long.BYTES-aligned) sliced into three typed views, reducing OS-level allocation overhead per query. 
--- .../nvidia/cuvs/MultiSegmentCagraSearch.java | 21 +++++-- .../cuvs/internal/CuVSResourcesImpl.java | 16 ++++- .../nvidia/cuvs/internal/CudaStreamPool.java | 62 +++++-------------- 3 files changed, 47 insertions(+), 52 deletions(-) diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/MultiSegmentCagraSearch.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/MultiSegmentCagraSearch.java index f39a0b2d0e..2085f7357b 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/MultiSegmentCagraSearch.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/MultiSegmentCagraSearch.java @@ -16,6 +16,7 @@ import com.nvidia.cuvs.internal.BufferedCagraSearch; import com.nvidia.cuvs.internal.CuVSParamsHelper; +import com.nvidia.cuvs.internal.CuVSResourcesImpl; import com.nvidia.cuvs.internal.CudaStreamPool; import com.nvidia.cuvs.internal.SelectKHelper; import java.lang.foreign.Arena; @@ -97,8 +98,8 @@ public static MultiSegmentSearchResults search( long outValBytes = (long) k * Float.BYTES; // Assign a pool slot to each segment via round-robin. - CudaStreamPool pool = CudaStreamPool.getOrCreate(); - int startSlot = CudaStreamPool.slotCounter.getAndAdd(numSegments); + CudaStreamPool pool = CuVSResourcesImpl.getStreamPool(resources); + int startSlot = pool.nextSlot(numSegments); int[] slots = new int[numSegments]; for (int i = 0; i < numSegments; i++) { slots[i] = Math.floorMod(startSlot + i, pool.size()); @@ -161,13 +162,21 @@ public static MultiSegmentSearchResults search( outIdxDP.handle(), k); - checkCuVSError(cuvsStreamSync(cuvsRes), "cuvsStreamSync after selectK"); + // No stream sync needed here: the D2H copies below are enqueued on the same cuvsStream, + // so CUDA stream ordering guarantees selectK completes before the copies begin. // --- Phase 4: single device-to-host copy for all three arrays --- + // Allocate one contiguous host buffer and slice into three typed views. 
+ // Layout (in order of decreasing alignment): int64 outIdx | float32 outVal | uint32 + // ordinals + // outIdxBytes is a multiple of Long.BYTES, so each slice is naturally aligned. try (var hostArena = Arena.ofConfined()) { - MemorySegment hostOutIdx = hostArena.allocate(outIdxBytes); - MemorySegment hostOutVal = hostArena.allocate(outValBytes); - MemorySegment hostAllOrdinals = hostArena.allocate(neighborsBytes); + MemorySegment hostBuf = + hostArena.allocate(outIdxBytes + outValBytes + neighborsBytes, Long.BYTES); + MemorySegment hostOutIdx = hostBuf.asSlice(0, outIdxBytes); + MemorySegment hostOutVal = hostBuf.asSlice(outIdxBytes, outValBytes); + MemorySegment hostAllOrdinals = + hostBuf.asSlice(outIdxBytes + outValBytes, neighborsBytes); cudaMemcpyAsync(hostOutIdx, outIdxDP.handle(), outIdxBytes, DEVICE_TO_HOST, cuvsStream); cudaMemcpyAsync(hostOutVal, outValDP.handle(), outValBytes, DEVICE_TO_HOST, cuvsStream); diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSResourcesImpl.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSResourcesImpl.java index 630155f883..ca3bbea8af 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSResourcesImpl.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSResourcesImpl.java @@ -28,6 +28,9 @@ public class CuVSResourcesImpl implements CuVSResources { private final int deviceId; private final PinnedMemoryBuffer hostBuffer = new PinnedMemoryBuffer(); + private final CudaStreamPool streamPool = + new CudaStreamPool( + Integer.getInteger(CudaStreamPool.SIZE_PROPERTY, CudaStreamPool.DEFAULT_SIZE)); /** * Constructor that allocates the resources needed for cuVS @@ -59,13 +62,24 @@ public int deviceId() { @Override public void close() { synchronized (this) { - CudaStreamPool.closeInstance(); + streamPool.close(); int returnValue = cuvsResourcesDestroy(resourceHandle); checkCuVSError(returnValue, "cuvsResourcesDestroy"); hostBuffer.close(); } } + /** + * 
Returns the {@link CudaStreamPool} owned by the given {@link CuVSResources} instance. + */ + public static CudaStreamPool getStreamPool(CuVSResources resources) { + if (resources instanceof CuVSResourcesImpl impl) { + return impl.streamPool; + } + throw new IllegalArgumentException( + "Unsupported resources type: " + resources.getClass().getName()); + } + @Override public Path tempDirectory() { return tempDirectory; diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CudaStreamPool.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CudaStreamPool.java index 22d95af7c1..cddb3b5cb5 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CudaStreamPool.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CudaStreamPool.java @@ -21,21 +21,20 @@ import java.lang.foreign.Arena; import java.lang.foreign.MemorySegment; -import java.util.concurrent.atomic.AtomicInteger; /** * A fixed-size pool of CUDA streams used by {@link com.nvidia.cuvs.MultiSegmentCagraSearch} * to run per-segment CAGRA searches in parallel. * *

    Each pool slot owns one {@code cuvsResources_t} handle backed by a dedicated non-blocking - * CUDA stream. Callers assign segments to slots via round-robin using {@link #slotCounter}; the + * CUDA stream. Callers assign segments to slots via round-robin using {@link #nextSlot}; the * GPU executes searches on different slots concurrently, then synchronizes via CUDA events before * the global {@code cuvsSelectK} call. * *

    Lifecycle

    - *

    The pool is a lazily-initialized static singleton. Call {@link #getOrCreate()} to obtain it - * on first use and {@link #closeInstance()} from {@code CuVSResourcesImpl.close()} to release - * all CUDA resources at application shutdown. + *

    One pool is owned by each {@code CuVSResourcesImpl} instance and closed when that instance + * is closed. This gives each thread its own independent set of streams and events, eliminating + * races when multiple threads perform concurrent multi-segment searches. * *

    Configuration

    *

    Pool size defaults to {@value #DEFAULT_SIZE} and can be overridden via the system property @@ -49,17 +48,15 @@ public final class CudaStreamPool implements AutoCloseable { /** System property name for overriding the pool size. */ public static final String SIZE_PROPERTY = "com.nvidia.cuvs.streamPoolSize"; - /** Round-robin counter shared across all {@code search()} calls. */ - public static final AtomicInteger slotCounter = new AtomicInteger(); - - private static volatile CudaStreamPool instance; + /** Round-robin counter; advanced by {@link #nextSlot(int)} on each search call. */ + private int slotCounter; private final long[] resources; // cuvsResources_t handles private final MemorySegment[] streams; // cudaStream_t handle values private final MemorySegment[] events; // pre-allocated cudaEvent_t handles, one per slot private final int size; - private CudaStreamPool(int size) { + CudaStreamPool(int size) { this.size = size; this.resources = new long[size]; this.streams = new MemorySegment[size]; @@ -87,41 +84,6 @@ private CudaStreamPool(int size) { } } - // ------------------------------------------------------------------------- - // Static singleton API - // ------------------------------------------------------------------------- - - /** - * Returns the singleton pool, creating it on first call. - */ - public static CudaStreamPool getOrCreate() { - CudaStreamPool pool = instance; - if (pool == null) { - synchronized (CudaStreamPool.class) { - pool = instance; - if (pool == null) { - int size = Integer.getInteger(SIZE_PROPERTY, DEFAULT_SIZE); - instance = pool = new CudaStreamPool(size); - } - } - } - return pool; - } - - /** - * Closes and nulls the singleton pool. Called from {@code CuVSResourcesImpl.close()}. 
- */ - public static void closeInstance() { - CudaStreamPool pool; - synchronized (CudaStreamPool.class) { - pool = instance; - instance = null; - } - if (pool != null) { - pool.close(); - } - } - // ------------------------------------------------------------------------- // Per-slot accessors // ------------------------------------------------------------------------- @@ -146,6 +108,16 @@ public int size() { return size; } + /** + * Advances the round-robin counter by {@code count} and returns the starting slot index for + * this call. Slot indices are wrapped modulo {@link #size()}. + */ + public int nextSlot(int count) { + int start = slotCounter; + slotCounter += count; + return start; + } + // ------------------------------------------------------------------------- // Lifecycle // ------------------------------------------------------------------------- From 82dcf71547524670d549add4c029dbb67f4ed5f8 Mon Sep 17 00:00:00 2001 From: James Xia Date: Mon, 13 Apr 2026 07:14:11 -0700 Subject: [PATCH 04/11] Default initialize CAGRA search parameters --- .../src/main/java/com/nvidia/cuvs/CagraSearchParams.java | 4 ++-- .../java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java | 8 ++------ 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraSearchParams.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraSearchParams.java index 76e1f10bd9..c4a3847755 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraSearchParams.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraSearchParams.java @@ -301,8 +301,8 @@ public static class Builder { private int numRandomSamplings = 1; private float hashMapMaxFillRate = 0.5f; private long randXORMask = 0x128394; - private SearchAlgo searchAlgo; - private HashMapMode hashMapMode; + private SearchAlgo searchAlgo = SearchAlgo.AUTO; + private HashMapMode hashMapMode = HashMapMode.AUTO_HASH; /** * Default constructor. 
diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java index ee12e7d88f..a1340a7c76 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java @@ -36,16 +36,12 @@ public static MemorySegment buildCagraSearchParams(Arena arena, CagraSearchParam cuvsCagraSearchParams.max_queries(seg, params.getMaxQueries()); cuvsCagraSearchParams.itopk_size(seg, params.getITopKSize()); cuvsCagraSearchParams.max_iterations(seg, params.getMaxIterations()); - if (params.getCagraSearchAlgo() != null) { - cuvsCagraSearchParams.algo(seg, params.getCagraSearchAlgo().value); - } + cuvsCagraSearchParams.algo(seg, params.getCagraSearchAlgo().value); cuvsCagraSearchParams.team_size(seg, params.getTeamSize()); cuvsCagraSearchParams.search_width(seg, params.getSearchWidth()); cuvsCagraSearchParams.min_iterations(seg, params.getMinIterations()); cuvsCagraSearchParams.thread_block_size(seg, params.getThreadBlockSize()); - if (params.getHashMapMode() != null) { - cuvsCagraSearchParams.hashmap_mode(seg, params.getHashMapMode().value); - } + cuvsCagraSearchParams.hashmap_mode(seg, params.getHashMapMode().value); cuvsCagraSearchParams.hashmap_max_fill_rate(seg, params.getHashMapMaxFillRate()); cuvsCagraSearchParams.num_random_samplings(seg, params.getNumRandomSamplings()); cuvsCagraSearchParams.rand_xor_mask(seg, params.getRandXORMask()); From fad76cb18bb18ff4826a4686b4aa97b841e1c1d8 Mon Sep 17 00:00:00 2001 From: James Xia Date: Mon, 13 Apr 2026 16:47:01 -0700 Subject: [PATCH 05/11] Enable persistent CAGRA kernel to be shared across index segments The persistent kernel runner was previously keyed on (dataset_desc, graph, fixed search params), which forced a destroy/recreate cycle for every segment when searching a multi-segment Lucene index: each segment has a different 
graph pointer and potentially a different auto-computed max_iterations, producing a different hash on every call. C++ changes (search_single_cta_kernel-inl.cuh): - Move dataset_desc_ptr, graph_ptr, and graph_degree from fixed runner state into per-job fields in job_desc_t. The persistent kernel reads them from the job descriptor, so one runner instance can serve any number of segments without being rebuilt. - Remove dataset_desc and graph arguments from persistent_runner_t constructor and calculate_parameter_hash; the runner is now keyed only on fixed kernel parameters (block_size, smem_size, itopk, etc.). - Update select_and_run to initialize the device descriptor on the caller's stream and synchronize before submission, then pass dd_dev_ptr, graph.data_handle(), and graph_degree to runner::launch. - Remove dd_host from persistent_runner_t; dataset upload is now the caller's responsibility on each launch. Java changes: - Add persistent, persistentLifetime, and persistentDeviceUsage fields, getters, and Builder methods to CagraSearchParams. - Wire the three persistent params through CuVSParamsHelper into the Panama-generated cuvsCagraSearchParams struct. 
--- .../cagra/search_single_cta_kernel-inl.cuh | 95 ++++++++++--------- .../com/nvidia/cuvs/CagraSearchParams.java | 83 +++++++++++++++- .../cuvs/internal/CuVSParamsHelper.java | 3 + 3 files changed, 136 insertions(+), 45 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 48553611bf..5122fad63f 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -1187,6 +1187,10 @@ struct alignas(kCacheLineBytes) job_desc_t { const data_type* queries_ptr; // [num_queries, dataset_dim] uint32_t top_k; uint32_t n_queries; + // Make index details job-specific so the persistent kernel singleton can serve multiple indexes + const DATASET_DESCRIPTOR_T* dataset_desc_ptr; + const index_type* graph_ptr; + uint32_t graph_degree; }; using blob_elem_type = uint4; constexpr static inline size_t kBlobSize = @@ -1230,12 +1234,9 @@ template RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel_p( - const DATASET_DESCRIPTOR_T* dataset_desc, worker_handle_t* worker_handles, job_desc_t* job_descriptors, uint32_t* completion_counters, - const typename DATASET_DESCRIPTOR_T::INDEX_T* const knn_graph, // [dataset_size, graph_degree] - const std::uint32_t graph_degree, const SourceIndexT* source_indices_ptr, const unsigned num_distilation, const uint64_t rand_xor_mask, @@ -1294,6 +1295,10 @@ RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel_p( auto top_k = job_descriptor.value.top_k; auto n_queries = job_descriptor.value.n_queries; auto query_id = worker_data.value.query_id; + // per-job index pointers + auto* dataset_desc = job_descriptor.value.dataset_desc_ptr; + auto* knn_graph = job_descriptor.value.graph_ptr; + auto graph_degree = job_descriptor.value.graph_degree; // work phase search_core; kernel_type kernel; uint32_t block_size; - dataset_descriptor_host dd_host; rmm::device_uvector 
worker_handles; rmm::device_uvector job_descriptors; rmm::device_uvector completion_counters; @@ -1900,9 +1904,9 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b * Calculate the hash of the parameters to detect if they've changed across the calls. * NB: this must have the same argument types as the constructor. */ + // NB: dataset_desc and graph are intentionally excluded from the hash — they are now per-job + // fields stored in the job descriptor, so the runner is keyed only on fixed kernel parameters. static inline auto calculate_parameter_hash( - std::reference_wrapper> dataset_desc, - raft::device_matrix_view graph, const SourceIndexT* source_indices_ptr, uint32_t max_candidates, uint32_t num_itopk_candidates, @@ -1923,16 +1927,13 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b float persistent_lifetime, float persistent_device_usage) -> uint64_t { - return uint64_t(graph.data_handle()) ^ uint64_t(source_indices_ptr) ^ - dataset_desc.get().team_size ^ num_itopk_candidates ^ block_size ^ smem_size ^ + return uint64_t(source_indices_ptr) ^ num_itopk_candidates ^ block_size ^ smem_size ^ hash_bitlen ^ small_hash_reset_interval ^ num_random_samplings ^ rand_xor_mask ^ num_seeds ^ itopk_size ^ search_width ^ min_iterations ^ max_iterations ^ uint64_t(persistent_lifetime * 1000) ^ uint64_t(persistent_device_usage * 1000); } persistent_runner_t( - std::reference_wrapper> dataset_desc, - raft::device_matrix_view graph, const SourceIndexT* source_indices_ptr, uint32_t max_candidates, uint32_t num_itopk_candidates, @@ -1960,10 +1961,7 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b job_descriptors(kMaxJobsNum, stream, job_descriptor_mr), completion_counters(kMaxJobsNum, stream, device_mr), hashmap(0, stream, device_mr), - dd_host{dataset_desc.get()}, - param_hash(calculate_parameter_hash(dd_host, - graph, - source_indices_ptr, + 
param_hash(calculate_parameter_hash(source_indices_ptr, max_candidates, num_itopk_candidates, block_size, @@ -1983,8 +1981,7 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b persistent_lifetime, persistent_device_usage)) { - // initialize the dataset/distance descriptor - auto* dd_dev_ptr = dd_host.dev_ptr(stream); + // dataset_desc and graph are now per-job fields; no fixed device descriptor at launch time. // set kernel attributes same as in normal kernel RAFT_CUDA_TRY( @@ -2027,18 +2024,14 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b } // launch the kernel - auto* graph_ptr = graph.data_handle(); - uint32_t graph_degree = graph.extent(1); + // dataset_desc and graph are per-job (in the job descriptor); not passed as kernel args. uint32_t* num_executed_iterations = nullptr; // optional arg [num_queries] const index_type* dev_seed_ptr = nullptr; // optional arg [num_queries, num_seeds] void* args[] = // NOLINT - {&dd_dev_ptr, - &worker_handles_ptr, + {&worker_handles_ptr, &job_descriptors_ptr, &completion_counters_ptr, - &graph_ptr, // [dataset_size, graph_degree] - &graph_degree, &source_indices_ptr, &num_random_samplings, &rand_xor_mask, @@ -2078,11 +2071,14 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b RAFT_LOG_INFO("Destroyed the persistent runner."); } - void launch(uintptr_t result_indices_ptr, // [num_queries, top_k] - distance_type* result_distances_ptr, // [num_queries, top_k] - const data_type* queries_ptr, // [num_queries, dataset_dim] + void launch(uintptr_t result_indices_ptr, // [num_queries, top_k] + distance_type* result_distances_ptr, // [num_queries, top_k] + const data_type* queries_ptr, // [num_queries, dataset_dim] uint32_t num_queries, - uint32_t top_k) + uint32_t top_k, + const descriptor_base_type* dd_dev_ptr, // device descriptor for this segment + const index_type* graph_ptr, // graph for this segment + uint32_t graph_degree) // 
graph degree for this segment { // submit all queries launcher_t launcher{job_queue, @@ -2095,14 +2091,20 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b result_distances_ptr, queries_ptr, top_k, - num_queries](uint32_t job_ix) { - auto& jd = job_descriptors.data()[job_ix].input.value; - auto* cflag = &job_descriptors.data()[job_ix].completion_flag; - jd.result_indices_ptr = result_indices_ptr; - jd.result_distances_ptr = result_distances_ptr; - jd.queries_ptr = queries_ptr; - jd.top_k = top_k; - jd.n_queries = num_queries; + num_queries, + dd_dev_ptr, + graph_ptr, + graph_degree](uint32_t job_ix) { + auto& jd = job_descriptors.data()[job_ix].input.value; + auto* cflag = &job_descriptors.data()[job_ix].completion_flag; + jd.result_indices_ptr = result_indices_ptr; + jd.result_distances_ptr = result_distances_ptr; + jd.queries_ptr = queries_ptr; + jd.top_k = top_k; + jd.n_queries = num_queries; + jd.dataset_desc_ptr = dd_dev_ptr; + jd.graph_ptr = graph_ptr; + jd.graph_degree = graph_degree; cflag->store(false, cuda::memory_order_relaxed); cuda::atomic_thread_fence(cuda::memory_order_release, cuda::thread_scope_system); @@ -2288,14 +2290,14 @@ void select_and_run( if (ps.persistent) { using runner_type = persistent_runner_t; - get_runner(/* -Note, we're passing the descriptor by reference here, and this reference is going to be passed to a -new spawned thread, which is dangerous. However, the descriptor is copied in that thread before the -control is returned in this thread (in persistent_runner_t constructor), so we're safe. -*/ - std::cref(dataset_desc), - graph, - source_indices_ptr, + // Initialize the device descriptor on the caller's stream (lazy, cached after first call). + // Synchronize to guarantee the upload completes before the GPU kernel reads the pointer from + // the job descriptor. This sync is cheap on all subsequent calls (stream is empty). 
+ auto* dd_dev_ptr = dataset_desc.dev_ptr(stream); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); + + // Runner is keyed on fixed kernel parameters only; dataset/graph are now per-job. + get_runner(source_indices_ptr, max_candidates, num_itopk_candidates, block_size, @@ -2314,7 +2316,14 @@ control is returned in this thread (in persistent_runner_t constructor), so we'r sample_filter, ps.persistent_lifetime, ps.persistent_device_usage) - ->launch(topk_indices_ptr, topk_distances_ptr, queries_ptr, num_queries, topk); + ->launch(topk_indices_ptr, + topk_distances_ptr, + queries_ptr, + num_queries, + topk, + dd_dev_ptr, + graph.data_handle(), + static_cast(graph.extent(1))); } else { using descriptor_base_type = dataset_descriptor_base_t; auto kernel = search_kernel_config:: diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraSearchParams.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraSearchParams.java index c4a3847755..a9ea94a440 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraSearchParams.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraSearchParams.java @@ -25,6 +25,9 @@ public class CagraSearchParams { private long randXORMask; private SearchAlgo searchAlgo; private HashMapMode hashMapMode; + private boolean persistent; + private float persistentLifetime; + private float persistentDeviceUsage; /** * Enum to denote algorithm used to search CAGRA Index. 
@@ -120,7 +123,10 @@ private CagraSearchParams( int hashmapMinBitlen, float hashmapMaxFillRate, int numRandomSamplings, - long randXORMask) { + long randXORMask, + boolean persistent, + float persistentLifetime, + float persistentDeviceUsage) { this.maxQueries = maxQueries; this.iTopKSize = iTopKSize; this.maxIterations = maxIterations; @@ -134,6 +140,9 @@ private CagraSearchParams( this.hashMapMaxFillRate = hashmapMaxFillRate; this.numRandomSamplings = numRandomSamplings; this.randXORMask = randXORMask; + this.persistent = persistent; + this.persistentLifetime = persistentLifetime; + this.persistentDeviceUsage = persistentDeviceUsage; } /** @@ -254,6 +263,33 @@ public HashMapMode getHashMapMode() { return hashMapMode; } + /** + * Gets whether the persistent kernel is enabled. + * + * @return true if the persistent kernel is enabled + */ + public boolean isPersistent() { + return persistent; + } + + /** + * Gets the persistent kernel lifetime in seconds. + * + * @return the lifetime in seconds + */ + public float getPersistentLifetime() { + return persistentLifetime; + } + + /** + * Gets the fraction of maximum grid size used by the persistent kernel. + * + * @return the device usage fraction (0.0, 1.0] + */ + public float getPersistentDeviceUsage() { + return persistentDeviceUsage; + } + @Override public String toString() { return "CagraSearchParams [maxQueries=" @@ -303,6 +339,9 @@ public static class Builder { private long randXORMask = 0x128394; private SearchAlgo searchAlgo = SearchAlgo.AUTO; private HashMapMode hashMapMode = HashMapMode.AUTO_HASH; + private boolean persistent = false; + private float persistentLifetime = 2.0f; + private float persistentDeviceUsage = 1.0f; /** * Default constructor. @@ -460,6 +499,43 @@ public Builder withRandXorMask(long randXORMask) { return this; } + /** + * Enables or disables the persistent kernel. + * + *

    When enabled, the CAGRA SINGLE_CTA kernel stays resident on the GPU and serves search + * jobs via system-scope atomics. The kernel is shared across all indexes. + * + * @param persistent true to enable the persistent kernel + * @return an instance of this Builder + */ + public Builder withPersistent(boolean persistent) { + this.persistent = persistent; + return this; + } + + /** + * Sets the time in seconds before an idle persistent kernel exits. + * + * @param persistentLifetime lifetime in seconds (default 2.0) + * @return an instance of this Builder + */ + public Builder withPersistentLifetime(float persistentLifetime) { + this.persistentLifetime = persistentLifetime; + return this; + } + + /** + * Sets the fraction of maximum grid size used by the persistent kernel. + * Must be greater than 0.0 and not greater than 1.0. + * + * @param persistentDeviceUsage device usage fraction (default 1.0) + * @return an instance of this Builder + */ + public Builder withPersistentDeviceUsage(float persistentDeviceUsage) { + this.persistentDeviceUsage = persistentDeviceUsage; + return this; + } + /** * Builds an instance of {@link CagraSearchParams} with passed search * parameters. 
@@ -480,7 +556,10 @@ public CagraSearchParams build() { hashMapMinBitlen, hashMapMaxFillRate, numRandomSamplings, - randXORMask); + randXORMask, + persistent, + persistentLifetime, + persistentDeviceUsage); } } } diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java index a1340a7c76..a4c914dbf8 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java @@ -45,6 +45,9 @@ public static MemorySegment buildCagraSearchParams(Arena arena, CagraSearchParam cuvsCagraSearchParams.hashmap_max_fill_rate(seg, params.getHashMapMaxFillRate()); cuvsCagraSearchParams.num_random_samplings(seg, params.getNumRandomSamplings()); cuvsCagraSearchParams.rand_xor_mask(seg, params.getRandXORMask()); + cuvsCagraSearchParams.persistent(seg, params.isPersistent()); + cuvsCagraSearchParams.persistent_lifetime(seg, params.getPersistentLifetime()); + cuvsCagraSearchParams.persistent_device_usage(seg, params.getPersistentDeviceUsage()); return seg; } From 22a2c8d6e9603cddc2da906d702bffea5aef3cfa Mon Sep 17 00:00:00 2001 From: James Xia Date: Tue, 14 Apr 2026 13:54:33 -0700 Subject: [PATCH 06/11] java: parallelize persistent-mode segment searches in MultiSegmentCagraSearch In persistent mode, `searchIntoBuffer` blocks on the CPU until the GPU signals completion via a system-scope atomic. Previously, segments were searched sequentially, so the GPU processed one segment at a time per query, leaving its job queue mostly idle between segment dispatches. Submit one async task per pool slot so all slots' segment searches are in-flight simultaneously. The persistent runner's job queue can hold all N segment jobs at once, allowing GPU workers to execute segments in parallel (bounded by worker_queue_size). 
Segments are grouped by pool slot rather than submitted one-per-segment to prevent concurrent access to the same cuvsResources_t handle: the descriptor_cache stored inside the RAFT resources object is not thread-safe, and multiple threads calling cuvsCagraSearch with the same handle causes a SIGSEGV. Grouping ensures each cuvsResources_t is accessed by at most one thread at a time. Effective parallelism is min(numSegments, pool.size()); increasing cuvsStreamPoolSize raises the ceiling. In non-persistent mode the existing sequential loop is unchanged: kernel launches are asynchronous and return immediately, so Java-level parallelism adds overhead without benefit. Co-Authored-By: Claude Sonnet 4.6 --- .../nvidia/cuvs/MultiSegmentCagraSearch.java | 127 +++++++++++++++--- 1 file changed, 108 insertions(+), 19 deletions(-) diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/MultiSegmentCagraSearch.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/MultiSegmentCagraSearch.java index 2085f7357b..50cad34f39 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/MultiSegmentCagraSearch.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/MultiSegmentCagraSearch.java @@ -22,8 +22,14 @@ import java.lang.foreign.Arena; import java.lang.foreign.MemorySegment; import java.lang.foreign.ValueLayout; +import java.util.ArrayList; import java.util.Arrays; import java.util.List; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; /** * Performs a single-query approximate nearest neighbor search across multiple CAGRA index segments @@ -37,7 +43,10 @@ *
<ol>
  <li>Assign each segment a slot from the {@link CudaStreamPool} via round-robin. Segments on * different slots run in parallel on separate CUDA streams.
  26. *
  27. For each segment, call {@link BufferedCagraSearch#searchIntoBuffer} to queue the CAGRA - * search kernel on the slot's stream; no per-segment sync or D2H copy.
  28. + * search kernel. In non-persistent mode this enqueues asynchronously on the slot's CUDA + * stream. In persistent mode each call blocks on CPU until the GPU worker signals completion; + * all segments are submitted concurrently via {@link #ASYNC_SEARCH_POOL} so the GPU can + * execute multiple segment jobs in parallel (bounded by {@code worker_queue_size}). *
  <li>Record a CUDA event on each slot's stream; make the main stream wait on all events.
  30. *
  <li>Call {@code cuvsSelectK} on the main stream to find the global top-k entirely on GPU.
  32. *
  <li>Sync the main stream.
  34. @@ -52,6 +61,22 @@ public class MultiSegmentCagraSearch { private MultiSegmentCagraSearch() {} + /** + * Thread pool used to submit persistent-mode segment searches concurrently. + * + *
<p>
    In persistent mode, {@link BufferedCagraSearch#searchIntoBuffer} blocks on the CPU until + * the GPU signals completion via a system-scope atomic. Running each segment in its own thread + * allows the persistent kernel's job queue to hold all N segment jobs simultaneously, so GPU + * workers can execute them in parallel (bounded by {@code worker_queue_size}). + */ + private static final ExecutorService ASYNC_SEARCH_POOL = + Executors.newCachedThreadPool( + r -> { + Thread t = new Thread(r, "cuvs-segment-search"); + t.setDaemon(true); + return t; + }); + /** * Searches multiple CAGRA index segments for the global top-k nearest neighbors. * @@ -114,25 +139,89 @@ public static MultiSegmentSearchResults search( var outIdxDP = allocateRMMSegment(cuvsRes, outIdxBytes); var outValDP = allocateRMMSegment(cuvsRes, outValBytes)) { - // --- Phase 1: queue all per-segment CAGRA searches, each on its own stream --- - // Use a single arena and pre-built search params shared across all segments to avoid - // repeated Arena.ofConfined() and segmentFromSearchParams calls. - // The arena is closed after Phase 1; all CPU-side structs are only needed until - // cuvsCagraSearch returns (the kernel launch is synchronous on the CPU side). - try (var segArena = Arena.ofConfined()) { - MemorySegment searchParams = - CuVSParamsHelper.buildCagraSearchParams( - segArena, queries.get(0).getCagraSearchParameters()); + // --- Phase 1: queue all per-segment CAGRA searches --- + CagraSearchParams searchParameters = queries.get(0).getCagraSearchParameters(); + if (searchParameters.isPersistent()) { + // Persistent mode: searchIntoBuffer blocks on CPU (via system-scope atomic spin) until + // the GPU signals completion. Submit one task per pool slot in parallel so the GPU can + // work on multiple segment jobs concurrently, bounded by worker_queue_size. 
+ // + // Segments are grouped by slot: if numSegments > pool.size(), multiple segments share a + // slot and must be serialized within that slot's task — each cuvsResources_t handle is + // not thread-safe for concurrent access (the descriptor_cache inside is not guarded). + // Parallelism = min(numSegments, pool.size()). + int poolSize = pool.size(); + // Collect segment indices per slot. Size: poolSize, each entry may have 0..n indices. + @SuppressWarnings("unchecked") + List[] segsBySlot = new List[poolSize]; + for (int slot = 0; slot < poolSize; slot++) { + segsBySlot[slot] = new ArrayList<>(); + } for (int i = 0; i < numSegments; i++) { - buffered[i].searchIntoBuffer( - queries.get(i), - globalNeighborsDP.handle(), - globalDistancesDP.handle(), - i, - pool.resources(slots[i]), - pool.stream(slots[i]), - searchParams, - segArena); + segsBySlot[slots[i]].add(i); + } + // Submit one task per occupied slot. + List> futures = new ArrayList<>(poolSize); + for (int slot = 0; slot < poolSize; slot++) { + if (segsBySlot[slot].isEmpty()) continue; + final int taskSlot = slot; + final List taskSegs = segsBySlot[slot]; + futures.add( + ASYNC_SEARCH_POOL.submit( + (Callable) + () -> { + try (var threadArena = Arena.ofConfined()) { + MemorySegment sp = + CuVSParamsHelper.buildCagraSearchParams( + threadArena, searchParameters); + for (int segIdx : taskSegs) { + buffered[segIdx].searchIntoBuffer( + queries.get(segIdx), + globalNeighborsDP.handle(), + globalDistancesDP.handle(), + segIdx, + pool.resources(taskSlot), + pool.stream(taskSlot), + sp, + threadArena); + } + } catch (Exception e) { + throw e; + } catch (Throwable t) { + throw new RuntimeException(t); + } + return null; + })); + } + for (Future f : futures) { + try { + f.get(); + } catch (ExecutionException e) { + throw e.getCause(); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw e; + } + } + } else { + // Non-persistent: each cuvsCagraSearch enqueues a CUDA kernel asynchronously 
and + // returns immediately; segments execute in parallel on their respective CUDA streams. + // A shared arena covers all per-call CPU allocations; it is closed once all launches + // have been enqueued. + try (var segArena = Arena.ofConfined()) { + MemorySegment searchParams = + CuVSParamsHelper.buildCagraSearchParams(segArena, searchParameters); + for (int i = 0; i < numSegments; i++) { + buffered[i].searchIntoBuffer( + queries.get(i), + globalNeighborsDP.handle(), + globalDistancesDP.handle(), + i, + pool.resources(slots[i]), + pool.stream(slots[i]), + searchParams, + segArena); + } } } From 69c4771f1fd70c4183a36a69fa2672cf89916ccf Mon Sep 17 00:00:00 2001 From: James Xia Date: Thu, 16 Apr 2026 16:26:26 -0700 Subject: [PATCH 07/11] Add native multi-segment CAGRA search and per-resources workspace pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Multi-segment search (C++/C/Java): - Add search_multi_segment() C++ function: builds a single search plan sized for the largest segment, packs per-segment descriptors, and launches search_kernel_ms — a new SINGLE_CTA kernel with grid dimensions (1, num_queries, num_segments) so each CTA independently searches one (query, segment) pair in a single kernel call. - Add cuvsCagraSearchMultiSegment() C API wrapping the above. - Simplify MultiSegmentCagraSearch.java to call cuvsCagraSearchMultiSegment() unconditionally, removing the previous per-segment stream pool, thread pool, CUDA event synchronization, and persistent/non-persistent branching. The search now completes in four phases: multi-segment kernel, GPU-side select-k, single D2H copy, result decoding. - Add BufferedCagraSearch.getIndexHandle() to expose the raw cuvsCagraIndex_t handle needed by the multi-segment kernel dispatch. Workspace pool (C/Java): - Add cuvsResourcesSetWorkspacePool(): configures the per-resources temporary workspace as an uncapped RMM pool that grows without shrinking. 
After warmup, cuvsRMMAlloc/cuvsRMMFree hit the pool cache instead of calling cudaMallocAsync/cudaFreeAsync, eliminating CUDA context lock contention under concurrent query threads. - Route cuvsRMMAlloc/cuvsRMMFree through the workspace resource so Java- side output buffer allocations also benefit from the pool. - Expose setWorkspacePool() in the CuVSResources Java interface with implementations in CuVSResourcesImpl and SynchronizedCuVSResources. Refactoring (search_single_cta_kernel-inl.cuh): - Extract TopkVariant enum and select_topk_variant() helper, shared by search_kernel_config and search_kernel_config_ms, replacing duplicated if/else trees in both choose_itopk_and_mx_candidates() bodies. - Extract kernel_dispatch_params::compute() to centralize max_candidates and max_itopk computation shared by select_and_run and select_and_run_multi_segment. - Extract hashmap_element_count() static helper on the search struct, used by both set_params (single-segment) and run_multi_segment. --- c/include/cuvs/core/c_api.h | 16 + c/include/cuvs/neighbors/cagra.h | 28 ++ c/src/core/c_api.cpp | 22 +- c/src/neighbors/cagra.cpp | 48 +++ cpp/include/cuvs/neighbors/cagra.hpp | 82 +++++ cpp/src/neighbors/cagra.cuh | 13 + cpp/src/neighbors/cagra_search_inst.cu.in | 18 + .../neighbors/detail/cagra/cagra_search.cuh | 166 +++++++++ .../detail/cagra/search_single_cta.cuh | 56 ++- .../detail/cagra/search_single_cta_inst.cuh | 17 + .../cagra/search_single_cta_kernel-inl.cuh | 329 +++++++++++++++--- .../detail/cagra/search_single_cta_kernel.cuh | 38 ++ .../java/com/nvidia/cuvs/CuVSResources.java | 18 + .../cuvs/SynchronizedCuVSResources.java | 5 + .../nvidia/cuvs/MultiSegmentCagraSearch.java | 214 ++++-------- .../cuvs/internal/BufferedCagraSearch.java | 7 + .../nvidia/cuvs/internal/CagraIndexImpl.java | 5 + .../cuvs/internal/CuVSResourcesImpl.java | 6 + .../com/nvidia/cuvs/CheckedCuVSResources.java | 5 + 19 files changed, 880 insertions(+), 213 deletions(-) diff --git 
a/c/include/cuvs/core/c_api.h b/c/include/cuvs/core/c_api.h index b9678b3771..622fe4cdcf 100644 --- a/c/include/cuvs/core/c_api.h +++ b/c/include/cuvs/core/c_api.h @@ -129,6 +129,22 @@ cuvsError_t cuvsStreamSync(cuvsResources_t res); */ cuvsError_t cuvsDeviceIdGet(cuvsResources_t res, int* device_id); +/** + * @brief Configure the temporary workspace on this resources object as an uncapped pool, backed + * by the current device memory resource. After the initial reservation is allocated on + * first use, subsequent calls to cuvsRMMAlloc / cuvsRMMFree on the same resources handle + * hit the pool cache rather than calling cudaMallocAsync / cudaFreeAsync, reducing CUDA + * context lock contention under concurrent query threads. The pool grows without shrinking: + * freed allocations are returned to the pool rather than to the device, so the pool's + * high-water mark only increases until the resources object is destroyed. + * + * @param[in] res cuvsResources_t opaque C handle + * @param[in] initial_size_bytes initial pool reservation in bytes; size to cover the + * steady-state working set to avoid growth after warmup + * @return cuvsError_t + */ +cuvsError_t cuvsResourcesSetWorkspacePool(cuvsResources_t res, size_t initial_size_bytes); + /** * @brief Create an Initialized opaque C handle for C++ type `raft::device_resources_snmg` * for multi-GPU operations diff --git a/c/include/cuvs/neighbors/cagra.h b/c/include/cuvs/neighbors/cagra.h index 3304e10f37..f6fc7da8e5 100644 --- a/c/include/cuvs/neighbors/cagra.h +++ b/c/include/cuvs/neighbors/cagra.h @@ -712,6 +712,34 @@ cuvsError_t cuvsCagraSearch(cuvsResources_t res, DLManagedTensor* distances, cuvsFilter filter); +/** + * @brief Search multiple CAGRA index segments concurrently using a single GPU kernel launch. + * + * Launches a single kernel with grid (1, num_queries, num_segments) so each CTA handles one + * (query, segment) pair concurrently. 
All results land in the caller-supplied device buffers + * on the same CUDA stream, so downstream operations (e.g. selectK) see them via stream ordering + * with no explicit synchronization needed. + * + * Only float32 datasets are currently supported. Distance values are comparable across segments + * (same scale) but are not postprocessed (no kScale correction) — they are suitable for + * relative comparison (selectK / recall). + * + * @param[in] res cuvsResources_t opaque C handle + * @param[in] params search parameters + * @param[in] num_segments number of index segments + * @param[in] indices array of num_segments cuvsCagraIndex_t pointers + * @param[in] queries array of num_segments DLManagedTensor* (device, float32, [nq, dim]) + * @param[out] neighbors array of num_segments DLManagedTensor* (device, uint32, [nq, topk]) + * @param[out] distances array of num_segments DLManagedTensor* (device, float32, [nq, topk]) + */ +cuvsError_t cuvsCagraSearchMultiSegment(cuvsResources_t res, + cuvsCagraSearchParams_t params, + uint32_t num_segments, + cuvsCagraIndex_t* indices, + DLManagedTensor** queries, + DLManagedTensor** neighbors, + DLManagedTensor** distances); + /** * @} */ diff --git a/c/src/core/c_api.cpp b/c/src/core/c_api.cpp index b27b3bc6d5..09f652c613 100644 --- a/c/src/core/c_api.cpp +++ b/c/src/core/c_api.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -36,6 +37,19 @@ extern "C" cuvsError_t cuvsResourcesCreate(cuvsResources_t* res) }); } +extern "C" cuvsError_t cuvsResourcesSetWorkspacePool(cuvsResources_t res, size_t initial_size_bytes) +{ + return cuvs::core::translate_exceptions([=] { + auto res_ptr = reinterpret_cast(res); + // Create an uncapped pool: pre-warms with initial_size_bytes to avoid cudaMalloc on every + // query, but can grow beyond that if an allocation exceeds the initial reservation. 
+ auto pool_mr = + std::make_shared>( + rmm::mr::get_current_device_resource(), initial_size_bytes); + raft::resource::set_workspace_resource(*res_ptr, pool_mr); + }); +} + extern "C" cuvsError_t cuvsResourcesDestroy(cuvsResources_t res) { return cuvs::core::translate_exceptions([=] { @@ -133,8 +147,8 @@ extern "C" cuvsError_t cuvsRMMAlloc(cuvsResources_t res, void** ptr, size_t byte { return cuvs::core::translate_exceptions([=] { auto res_ptr = reinterpret_cast(res); - auto mr = rmm::mr::get_current_device_resource_ref(); - *ptr = mr.allocate(raft::resource::get_cuda_stream(*res_ptr), bytes); + auto stream = raft::resource::get_cuda_stream(*res_ptr); + *ptr = raft::resource::get_workspace_resource(*res_ptr)->allocate(stream, bytes); }); } @@ -142,8 +156,8 @@ extern "C" cuvsError_t cuvsRMMFree(cuvsResources_t res, void* ptr, size_t bytes) { return cuvs::core::translate_exceptions([=] { auto res_ptr = reinterpret_cast(res); - auto mr = rmm::mr::get_current_device_resource_ref(); - mr.deallocate(raft::resource::get_cuda_stream(*res_ptr), ptr, bytes); + auto stream = raft::resource::get_cuda_stream(*res_ptr); + raft::resource::get_workspace_resource(*res_ptr)->deallocate(stream, ptr, bytes); }); } diff --git a/c/src/neighbors/cagra.cpp b/c/src/neighbors/cagra.cpp index 081179ca46..2a4edda2a8 100644 --- a/c/src/neighbors/cagra.cpp +++ b/c/src/neighbors/cagra.cpp @@ -689,6 +689,54 @@ extern "C" cuvsError_t cuvsCagraSearch(cuvsResources_t res, }); } +extern "C" cuvsError_t cuvsCagraSearchMultiSegment(cuvsResources_t res, + cuvsCagraSearchParams_t params, + uint32_t num_segments, + cuvsCagraIndex_t* indices, + DLManagedTensor** queries, + DLManagedTensor** neighbors, + DLManagedTensor** distances) +{ + return cuvs::core::translate_exceptions([=] { + RAFT_EXPECTS(num_segments > 0, "num_segments must be > 0"); + RAFT_EXPECTS(indices != nullptr && queries != nullptr && neighbors != nullptr && + distances != nullptr, + "All pointer arrays must be non-null"); + + auto res_ptr 
= reinterpret_cast(res); + auto search_params = cuvs::neighbors::cagra::search_params(); + convert_c_search_params(*params, &search_params); + + // Only float32 is supported for multi-segment search. + RAFT_EXPECTS( + indices[0]->dtype.code == kDLFloat && indices[0]->dtype.bits == 32, + "Multi-segment search only supports float32 indices"); + + using T = float; + using IdxT = uint32_t; + using OutIdxT = uint32_t; + using DistanceT = float; + using IndexT = cuvs::neighbors::cagra::index; + + std::vector idx_vec(num_segments); + std::vector> q_vec(num_segments); + std::vector> n_vec(num_segments); + std::vector> d_vec(num_segments); + + for (uint32_t i = 0; i < num_segments; i++) { + RAFT_EXPECTS(indices[i] != nullptr && indices[i]->addr != 0, + "Index at position %u is null or not built", i); + idx_vec[i] = reinterpret_cast(indices[i]->addr); + q_vec[i] = cuvs::core::from_dlpack>(queries[i]); + n_vec[i] = cuvs::core::from_dlpack>(neighbors[i]); + d_vec[i] = cuvs::core::from_dlpack>(distances[i]); + } + + cuvs::neighbors::cagra::search_multi_segment( + *res_ptr, search_params, idx_vec, q_vec, n_vec, d_vec); + }); +} + extern "C" cuvsError_t cuvsCagraMerge(cuvsResources_t res, cuvsCagraIndexParams_t params, cuvsCagraIndex_t* indices, diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp index a7e1249677..fd9b551a36 100644 --- a/cpp/include/cuvs/neighbors/cagra.hpp +++ b/cpp/include/cuvs/neighbors/cagra.hpp @@ -29,6 +29,7 @@ #include #include #include +#include namespace cuvs::neighbors::graph_build_params { using iterative_search_params = cuvs::neighbors::search_params; @@ -1723,6 +1724,87 @@ void search(raft::resources const& res, const cuvs::neighbors::filtering::base_filter& sample_filter = cuvs::neighbors::filtering::none_sample_filter{}); +/** + * @brief Search multiple CAGRA index segments concurrently in a single kernel launch. + * + * Launches a single SINGLE_CTA kernel grid that covers all segments, with one CTA per segment. 
+ * All per-segment results are written into the caller-supplied device buffers on the stream + * associated with @p res; the call returns when all segments have been submitted to the stream + * (not necessarily completed). Use @c cuvsStreamSync to wait for completion. + * + * Distance values are comparable across segments but are not postprocessed (no kScale correction). + * + * @param[in] res raft resources + * @param[in] params search parameters + * @param[in] indices one index per segment + * @param[in] queries per-segment query matrix [n_queries, dim] + * @param[out] neighbors per-segment result neighbors [n_queries, topk] + * @param[out] distances per-segment result distances [n_queries, topk] + */ +void search_multi_segment( + raft::resources const& res, + cuvs::neighbors::cagra::search_params const& params, + const std::vector*>& indices, + const std::vector>& queries, + const std::vector>& neighbors, + const std::vector>& distances); + +void search_multi_segment( + raft::resources const& res, + cuvs::neighbors::cagra::search_params const& params, + const std::vector*>& indices, + const std::vector>& queries, + const std::vector>& neighbors, + const std::vector>& distances); + +void search_multi_segment( + raft::resources const& res, + cuvs::neighbors::cagra::search_params const& params, + const std::vector*>& indices, + const std::vector>& queries, + const std::vector>& neighbors, + const std::vector>& distances); + +void search_multi_segment( + raft::resources const& res, + cuvs::neighbors::cagra::search_params const& params, + const std::vector*>& indices, + const std::vector>& queries, + const std::vector>& neighbors, + const std::vector>& distances); + +void search_multi_segment( + raft::resources const& res, + cuvs::neighbors::cagra::search_params const& params, + const std::vector*>& indices, + const std::vector>& queries, + const std::vector>& neighbors, + const std::vector>& distances); + +void search_multi_segment( + raft::resources const& res, + 
cuvs::neighbors::cagra::search_params const& params, + const std::vector*>& indices, + const std::vector>& queries, + const std::vector>& neighbors, + const std::vector>& distances); + +void search_multi_segment( + raft::resources const& res, + cuvs::neighbors::cagra::search_params const& params, + const std::vector*>& indices, + const std::vector>& queries, + const std::vector>& neighbors, + const std::vector>& distances); + +void search_multi_segment( + raft::resources const& res, + cuvs::neighbors::cagra::search_params const& params, + const std::vector*>& indices, + const std::vector>& queries, + const std::vector>& neighbors, + const std::vector>& distances); + /** * @} */ diff --git a/cpp/src/neighbors/cagra.cuh b/cpp/src/neighbors/cagra.cuh index 73c3794d39..62fa379fb6 100644 --- a/cpp/src/neighbors/cagra.cuh +++ b/cpp/src/neighbors/cagra.cuh @@ -406,6 +406,19 @@ index merge(raft::resources const& handle, return cagra::detail::merge(handle, params, indices, row_filter); } +template +void search_multi_segment( + raft::resources const& res, + search_params const& params, + const std::vector*>& indices, + const std::vector>& queries, + const std::vector>& neighbors, + const std::vector>& distances) +{ + cagra::detail::search_multi_segment( + res, params, indices, queries, neighbors, distances); +} + /** @} */ // end group cagra } // namespace cuvs::neighbors::cagra diff --git a/cpp/src/neighbors/cagra_search_inst.cu.in b/cpp/src/neighbors/cagra_search_inst.cu.in index dfef630798..be5c4a906d 100644 --- a/cpp/src/neighbors/cagra_search_inst.cu.in +++ b/cpp/src/neighbors/cagra_search_inst.cu.in @@ -32,4 +32,22 @@ CUVS_INST_CAGRA_SEARCH(data_t, uint32_t, int64_t); #undef CUVS_INST_CAGRA_SEARCH +#define CUVS_INST_CAGRA_SEARCH_MULTI_SEGMENT(T, IdxT, OutputIdxT) \ + void search_multi_segment( \ + raft::resources const& handle, \ + cuvs::neighbors::cagra::search_params const& params, \ + const std::vector*>& indices, \ + const std::vector>& queries, \ + const 
std::vector>& neighbors, \ + const std::vector>& distances) \ + { \ + cuvs::neighbors::cagra::search_multi_segment( \ + handle, params, indices, queries, neighbors, distances); \ + } + +CUVS_INST_CAGRA_SEARCH_MULTI_SEGMENT(data_t, uint32_t, uint32_t); +CUVS_INST_CAGRA_SEARCH_MULTI_SEGMENT(data_t, uint32_t, int64_t); + +#undef CUVS_INST_CAGRA_SEARCH_MULTI_SEGMENT + } // namespace cuvs::neighbors::cagra diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh index f1650980e0..4b3480ae50 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh @@ -9,6 +9,7 @@ #include "factory.cuh" #include "sample_filter_utils.cuh" #include "search_plan.cuh" +#include "search_single_cta.cuh" #include #include @@ -251,4 +252,169 @@ void search_main(raft::resources const& res, } /** @} */ // end group cagra +/** + * @brief Search all segments concurrently for a single query using one kernel launch. + * + * Each segment's CTA runs independently (blockIdx.z = segment_id, blockIdx.y = query_id). + * All segments must use float32 data with the same search parameters. 
+ * + * @param indices per-segment indices (strided datasets only for now) + * @param queries per-segment query views — same vector repeated for each segment + * @param neighbors per-segment output neighbor views — each [num_queries, topk] + * @param distances per-segment output distance views — each [num_queries, topk] + */ +template +void search_multi_segment( + raft::resources const& res, + search_params params, + const std::vector*>& indices, + const std::vector>& queries, + const std::vector>& neighbors, + const std::vector>& distances) +{ + static_assert(std::is_same_v, "Only uint32_t graph index type is supported"); + static_assert(std::is_same_v, "Only float distances are supported"); + + const uint32_t num_segments = static_cast(indices.size()); + RAFT_EXPECTS(num_segments > 0, "At least one segment is required"); + RAFT_EXPECTS(queries.size() == num_segments && neighbors.size() == num_segments && + distances.size() == num_segments, + "All input vectors must have the same size"); + + const int64_t dim = queries[0].extent(1); + const uint32_t topk = static_cast(neighbors[0].extent(1)); + const uint32_t n_queries = static_cast(queries[0].extent(0)); + + // Find the max graph_degree across all segments (needed for the shared kernel plan). + int64_t max_graph_degree = 0; + int64_t max_dataset_size = 0; + for (uint32_t i = 0; i < num_segments; i++) { + RAFT_EXPECTS(!indices[i]->dataset_fd().has_value(), + "Disk-based datasets are not supported for multi-segment search"); + max_graph_degree = std::max(max_graph_degree, indices[i]->graph().extent(1)); + max_dataset_size = std::max(max_dataset_size, indices[i]->data().n_rows()); + } + + if (params.max_queries == 0) { + cudaDeviceProp deviceProp = raft::resource::get_device_properties(res); + params.max_queries = + std::min(static_cast(n_queries), deviceProp.maxGridSize[1]); + } + + // Multi-segment uses a regular (non-persistent) single-CTA kernel launch. 
+ // MULTI_CTA requires a different plan type (multi_cta_search::search) and is not supported here. + // AUTO could resolve to MULTI_CTA for large itopk_size, so force SINGLE_CTA unconditionally. + params.persistent = false; + params.algo = search_algo::SINGLE_CTA; + + // Build a single search plan sized for the maximum graph_degree across all segments. + // For the first segment's descriptor type (strided float): use it to init the plan. + using graph_idx_type = uint32_t; + auto* strided_dset0 = dynamic_cast*>(&indices[0]->data()); + RAFT_EXPECTS(strided_dset0 != nullptr, + "Multi-segment search only supports strided (non-compressed) datasets"); + + RAFT_EXPECTS(indices[0]->metric() != cuvs::distance::DistanceType::CosineExpanded || + indices[0]->dataset_norms().has_value(), + "Dataset norms must be provided for CosineExpanded metric"); + const float* dataset_norms_ptr0 = nullptr; + if (indices[0]->metric() == cuvs::distance::DistanceType::CosineExpanded) { + dataset_norms_ptr0 = indices[0]->dataset_norms().value().data_handle(); + } + // Use the first segment's descriptor to construct the plan (smem layout is type-dependent only). + auto plan_desc = dataset_descriptor_init_with_cache( + res, params, *strided_dset0, indices[0]->metric(), dataset_norms_ptr0); + + single_cta_search::search + plan(res, params, plan_desc, dim, max_dataset_size, max_graph_degree, topk); + + // Build per-segment descriptors and result pointers on the host. + // The device copy is allocated below. + using seg_desc_t = single_cta_search::multi_segment_desc_t; + std::vector host_seg_descs(num_segments); + + cudaStream_t stream = raft::resource::get_cuda_stream(res); + + // Collect per-segment dataset descriptors (may trigger lazy device init on `stream`). 
+ std::vector> seg_dataset_descs; + seg_dataset_descs.reserve(num_segments); + + for (uint32_t i = 0; i < num_segments; i++) { + auto* strided_dset = dynamic_cast*>(&indices[i]->data()); + RAFT_EXPECTS(strided_dset != nullptr, + "All segments must have strided (non-compressed) datasets"); + const float* norms_ptr = nullptr; + if (indices[i]->metric() == cuvs::distance::DistanceType::CosineExpanded) { + RAFT_EXPECTS(indices[i]->dataset_norms().has_value(), + "Dataset norms required for CosineExpanded metric (segment %u)", i); + norms_ptr = indices[i]->dataset_norms().value().data_handle(); + } + seg_dataset_descs.push_back(dataset_descriptor_init_with_cache( + res, params, *strided_dset, indices[i]->metric(), norms_ptr)); + + // Call dev_ptr to trigger lazy device-side descriptor upload (enqueued on stream). + host_seg_descs[i].dataset_desc = seg_dataset_descs.back().dev_ptr(stream); + host_seg_descs[i].graph = indices[i]->graph().data_handle(); + host_seg_descs[i].graph_degree = static_cast(indices[i]->graph().extent(1)); + host_seg_descs[i].queries_ptr = queries[i].data_handle(); + + // Tag the result_indices_ptr with the OutputIdxT size (same convention as select_and_run). + constexpr uintptr_t kTag = raft::Pow2::Log2; + host_seg_descs[i].result_indices_ptr = + reinterpret_cast(neighbors[i].data_handle()) | kTag; + host_seg_descs[i].result_distances_ptr = distances[i].data_handle(); + } + + // Upload segment descriptors via workspace pool (no cudaMallocAsync/cudaFreeAsync after warmup). + lightweight_uvector dev_seg_descs_buf(res); + dev_seg_descs_buf.resize(num_segments, stream); + RAFT_CUDA_TRY(cudaMemcpyAsync(dev_seg_descs_buf.data(), + host_seg_descs.data(), + num_segments * sizeof(seg_desc_t), + cudaMemcpyHostToDevice, + stream)); + + // Launch all-segment kernel; stream ordering ensures descriptor upload and per-segment + // dataset_desc device-init complete before the search kernel executes. 
+ plan.run_multi_segment(res, dev_seg_descs_buf.data(), num_segments, n_queries, topk); + // dev_seg_descs_buf destructor returns memory to workspace pool (stream-ordered). + + // Post-process distances (scale + metric transform) for each segment. + constexpr float kScale = cuvs::spatial::knn::detail::utils::config::kDivisor / + cuvs::spatial::knn::detail::utils::config::kDivisor; + for (uint32_t i = 0; i < num_segments; i++) { + float* dist_out = distances[i].data_handle(); + const DistanceT* dist_in = distances[i].data_handle(); + if (indices[i]->metric() == cuvs::distance::DistanceType::CosineExpanded) { + auto query_norms = raft::make_device_vector(res, n_queries); + auto scaled_sq_op = + raft::compose_op(raft::sq_op{}, + raft::div_const_op{DistanceT(kScale)}, + raft::cast_op()); + raft::linalg::reduce( + res, + raft::make_device_matrix_view( + queries[i].data_handle(), n_queries, dim), + query_norms.view(), + (DistanceT)0, + false, + scaled_sq_op, + raft::add_op(), + raft::sqrt_op{}); + raft::linalg::matrix_vector_op( + res, + raft::make_const_mdspan(distances[i]), + raft::make_const_mdspan(query_norms.view()), + distances[i], + raft::compose_op(raft::add_const_op{DistanceT(1)}, raft::div_checkzero_op{})); + } else { + cuvs::neighbors::ivf::detail::postprocess_distances( + res, dist_out, dist_in, indices[i]->metric(), n_queries, topk, kScale, true); + } + } +} + } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh index 02bf1ff697..0592dc4ed5 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh @@ -87,6 +87,12 @@ struct search uint32_t num_itopk_candidates; + /** Number of elements in a hashmap covering @p n_queries queries across @p n_segments segments. 
*/ + static size_t hashmap_element_count(size_t n_segments, size_t n_queries, size_t h_bitlen) + { + return n_segments * n_queries * hashmap::get_size(h_bitlen); + } + search(raft::resources const& res, search_params params, const dataset_descriptor_host& dataset_desc, @@ -197,12 +203,60 @@ struct search RAFT_LOG_DEBUG("# smem_size: %u", smem_size); hashmap_size = 0; if (small_hash_bitlen == 0 && !this->persistent) { - hashmap_size = max_queries * hashmap::get_size(hash_bitlen); + hashmap_size = hashmap_element_count(1, max_queries, hash_bitlen); hashmap.resize(hashmap_size, raft::resource::get_cuda_stream(res)); } RAFT_LOG_DEBUG("# hashmap_size: %lu", hashmap_size); } + /** + * @brief Search all segments concurrently in a single kernel launch. + * + * @param res RAFT resources (stream is extracted from here) + * @param segment_descs device pointer to [num_segments] descriptors + * @param num_segments number of segments (gridDim.z) + * @param num_queries queries per segment (gridDim.y) + * @param topk neighbors to return per (query, segment) + */ + void run_multi_segment( + raft::resources const& res, + const multi_segment_desc_t* segment_descs, + uint32_t num_segments, + uint32_t num_queries, + uint32_t topk) + { + cudaStream_t stream = raft::resource::get_cuda_stream(res); + + // Allocate global hashmap when small-hash is disabled via the workspace pool + // (no cudaMallocAsync/cudaFreeAsync after pool warmup). + // Layout: [num_segments][num_queries][hash_size]. 
+ lightweight_uvector ms_hashmap_buf(res); + INDEX_T* ms_hashmap_ptr = nullptr; + if (small_hash_bitlen == 0) { + const size_t ms_hashmap_elems = hashmap_element_count(num_segments, num_queries, hash_bitlen); + ms_hashmap_buf.resize(ms_hashmap_elems, stream); + ms_hashmap_ptr = ms_hashmap_buf.data(); + } + + select_and_run_multi_segment( + segment_descs, + num_segments, + num_queries, + *this, + topk, + num_itopk_candidates, + static_cast(thread_block_size), + smem_size, + hash_bitlen, + ms_hashmap_ptr, + small_hash_bitlen, + small_hash_reset_interval, + cuvs::neighbors::filtering::none_sample_filter{}, + stream); + // ms_hashmap_buf destructor returns memory to workspace pool (stream-ordered). + } + void operator()( raft::resources const& res, raft::device_matrix_view graph, diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh index 11b468cfca..18314b5c59 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh @@ -34,4 +34,21 @@ namespace cuvs::neighbors::cagra::detail::single_cta_search { SampleFilterT sample_filter, \ cudaStream_t stream); +#define instantiate_kernel_selection_ms(DataT, IndexT, DistanceT, SampleFilterT) \ + template void select_and_run_multi_segment( \ + const multi_segment_desc_t* segment_descs, \ + uint32_t num_segments, \ + uint32_t num_queries, \ + const search_params& ps, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + IndexT* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + SampleFilterT sample_filter, \ + cudaStream_t stream); + } // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 5122fad63f..c0aaa1ec05 
100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -1165,6 +1165,146 @@ RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel( graph_size); } +/** + * @brief Multi-segment CAGRA search kernel. + * + * Grid: (1, num_queries, num_segments). + * Each CTA handles one (query, segment) pair independently. + * The global hashmap (if used) must be laid out as + * [num_segments][num_queries][hashmap::get_size(hash_bitlen)]. + */ +template +RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel_ms( + const multi_segment_desc_t* segments, + const std::uint32_t top_k, + const SourceIndexT* source_indices_ptr, + const unsigned num_distilation, + const uint64_t rand_xor_mask, + const uint32_t num_seeds, + typename DATASET_DESCRIPTOR_T::INDEX_T* visited_hashmap_ptr, + const std::uint32_t max_candidates, + const std::uint32_t max_itopk, + const std::uint32_t internal_topk, + const std::uint32_t search_width, + const std::uint32_t min_iteration, + const std::uint32_t max_iteration, + std::uint32_t* const num_executed_iterations, + const std::uint32_t hash_bitlen, + const std::uint32_t small_hash_bitlen, + const std::uint32_t small_hash_reset_interval, + SAMPLE_FILTER_T sample_filter) +{ + using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; + + const uint32_t query_id = blockIdx.y; + const uint32_t seg_id = blockIdx.z; + const auto& seg = segments[seg_id]; + + // Offset the global hashmap to the base of this segment's block. + // search_core will then add blockIdx.y * hash_size for the per-query offset, giving the correct + // layout: visited_hashmap_ptr[(seg_id * gridDim.y + query_id) * hash_size]. + INDEX_T* seg_hashmap_ptr = + (visited_hashmap_ptr != nullptr) + ? 
visited_hashmap_ptr + + seg_id * static_cast(gridDim.y) * hashmap::get_size(hash_bitlen) + : nullptr; + + search_core(seg.result_indices_ptr, + seg.result_distances_ptr, + top_k, + seg.dataset_desc, + seg.queries_ptr, + seg.graph, + seg.graph_degree, + source_indices_ptr, + num_distilation, + rand_xor_mask, + nullptr, // seed_ptr: not used in multi-segment + 0, // num_seeds + seg_hashmap_ptr, + max_candidates, + max_itopk, + internal_topk, + search_width, + min_iteration, + max_iteration, + num_executed_iterations, + hash_bitlen, + small_hash_bitlen, + small_hash_reset_interval, + query_id, + sample_filter); +} + +template +auto dispatch_kernel_ms = []() { + static_assert(TOPK_BY_BITONIC_SORT || !BITONIC_SORT_AND_MERGE_MULTI_WARPS); + return search_kernel_ms; +}(); + +/** + * @brief Encodes the (TOPK_BY_BITONIC_SORT, BITONIC_SORT_AND_MERGE_MULTI_WARPS) template + * booleans as a runtime value, selected from search parameters. + */ +enum class TopkVariant { + BITONIC, ///< bitonic sort, no multi-warp merge (num_itopk_candidates ≤ 256, itopk_size ≤ 256) + BITONIC_MERGE_MULTI, ///< bitonic sort, multi-warp merge (num_itopk_candidates ≤ 256, itopk_size > 256) + RADIX, ///< radix-based topk (num_itopk_candidates > 256) +}; + +/** + * @brief Selects the topk algorithm variant from runtime search parameters. 
+ */ +inline TopkVariant select_topk_variant(unsigned itopk_size, + unsigned num_itopk_candidates, + unsigned block_size) +{ + assert(itopk_size <= 512); + if (num_itopk_candidates <= 256) { + if (itopk_size <= 256) { return TopkVariant::BITONIC; } + assert(block_size >= 64); + return TopkVariant::BITONIC_MERGE_MULTI; + } + return TopkVariant::RADIX; +} + +template +struct search_kernel_config_ms { + using kernel_t = decltype(dispatch_kernel_ms); + + static auto choose_itopk_and_mx_candidates(unsigned itopk_size, + unsigned num_itopk_candidates, + unsigned block_size) -> kernel_t + { + switch (select_topk_variant(itopk_size, num_itopk_candidates, block_size)) { + case TopkVariant::BITONIC: + return dispatch_kernel_ms; + case TopkVariant::BITONIC_MERGE_MULTI: + return dispatch_kernel_ms; + default: + return dispatch_kernel_ms; + } + } +}; + // To make sure we avoid false sharing on both CPU and GPU, we enforce cache line size to the // maximum of the two. // This makes sync atomic significantly faster. @@ -1385,32 +1525,13 @@ struct search_kernel_config { unsigned num_itopk_candidates, unsigned block_size) -> kernel_t { - assert(itopk_size <= 512); - if (num_itopk_candidates <= 256) { - if (itopk_size <= 256) { - return dispatch_kernel; - } else { - assert(block_size >= 64); - return dispatch_kernel; - } - } else { - // Radix-based topk is used - return dispatch_kernel; + switch (select_topk_variant(itopk_size, num_itopk_candidates, block_size)) { + case TopkVariant::BITONIC: + return dispatch_kernel; + case TopkVariant::BITONIC_MERGE_MULTI: + return dispatch_kernel; + default: + return dispatch_kernel; } } }; @@ -2224,6 +2345,48 @@ auto get_runner(Args... args) -> std::shared_ptr return runner; } +/** + * @brief Computes the max_candidates and max_itopk constants passed to the search kernel. + * + * Both values are rounded up to the next power-of-two bucket supported by the kernel template + * instantiations. 
They are the same for single-segment and multi-segment launches, so this helper + * is shared by select_and_run and select_and_run_multi_segment. + */ +struct kernel_dispatch_params { + uint32_t max_candidates; + uint32_t max_itopk; + + static kernel_dispatch_params compute(const search_params& ps, uint32_t num_itopk_candidates) + { + kernel_dispatch_params p{}; + if (num_itopk_candidates <= 64) { + p.max_candidates = 64; + } else if (num_itopk_candidates <= 128) { + p.max_candidates = 128; + } else if (num_itopk_candidates <= 256) { + p.max_candidates = 256; + } else { + p.max_candidates = 32; // irrelevant, radix-based topk is used + } + + assert(ps.itopk_size <= 512); + if (num_itopk_candidates <= 256) { // bitonic sort + if (ps.itopk_size <= 64) { + p.max_itopk = 64; + } else if (ps.itopk_size <= 128) { + p.max_itopk = 128; + } else if (ps.itopk_size <= 256) { + p.max_itopk = 256; + } else { + p.max_itopk = 512; + } + } else { // radix sort + p.max_itopk = (ps.itopk_size <= 256) ? 256 : 512; + } + return p; + } +}; + template data_handle() : nullptr; - uint32_t max_candidates{}; - if (num_itopk_candidates <= 64) { - max_candidates = 64; - } else if (num_itopk_candidates <= 128) { - max_candidates = 128; - } else if (num_itopk_candidates <= 256) { - max_candidates = 256; - } else { - max_candidates = - 32; // irrelevant, radix based topk is used (see choose_itopk_and_max_candidates) - } - - uint32_t max_itopk{}; - assert(ps.itopk_size <= 512); - if (num_itopk_candidates <= 256) { // bitonic sort - if (ps.itopk_size <= 64) { - max_itopk = 64; - } else if (ps.itopk_size <= 128) { - max_itopk = 128; - } else if (ps.itopk_size <= 256) { - max_itopk = 256; - } else { - max_itopk = 512; - } - } else { // radix sort - if (ps.itopk_size <= 256) { - max_itopk = 256; - } else { - max_itopk = 512; - } - } + auto [max_candidates, max_itopk] = kernel_dispatch_params::compute(ps, num_itopk_candidates); if (ps.persistent) { using runner_type = persistent_runner_t; @@ -2360,5 
+2493,87 @@ void select_and_run( RAFT_CUDA_TRY(cudaPeekAtLastError()); } } + +/** + * @brief Launch the multi-segment CAGRA search kernel. + * + * Searches all N segments concurrently in a single kernel launch. Each CTA (indexed by + * blockIdx.y = query_id, blockIdx.z = seg_id) independently searches one segment for one query. + * + * @param segment_descs device pointer to array of num_segments descriptors + * @param num_segments number of segments (= gridDim.z) + * @param num_queries number of queries (= gridDim.y) + * @param ps search parameters (shared across all segments) + * @param topk number of neighbors to return per segment + * @param num_itopk_candidates search_width * max_graph_degree + * @param block_size thread-block size + * @param smem_size shared memory per CTA (computed for max graph_degree) + * @param hash_bitlen global hashmap bit-length + * @param hashmap_ptr device buffer sized [num_segments * num_queries * get_size(hash_bitlen)] + * @param small_hash_bitlen small-hash bit-length (0 = disabled) + * @param small_hash_reset_interval reset interval for small hash + * @param sample_filter sample filter + * @param stream CUDA stream + */ +template +void select_and_run_multi_segment( + const multi_segment_desc_t* segment_descs, + uint32_t num_segments, + uint32_t num_queries, + const search_params& ps, + uint32_t topk, + uint32_t num_itopk_candidates, + uint32_t block_size, + uint32_t smem_size, + int64_t hash_bitlen, + IndexT* hashmap_ptr, + size_t small_hash_bitlen, + size_t small_hash_reset_interval, + SampleFilterT sample_filter, + cudaStream_t stream) +{ + using descriptor_base_type = dataset_descriptor_base_t; + + auto [max_candidates, max_itopk] = kernel_dispatch_params::compute(ps, num_itopk_candidates); + + auto kernel = + search_kernel_config_ms:: + choose_itopk_and_mx_candidates(ps.itopk_size, num_itopk_candidates, block_size); + + dim3 thread_dims(block_size, 1, 1); + dim3 block_dims(1, num_queries, num_segments); + 
RAFT_LOG_DEBUG("Launching ms kernel: %u threads, %u queries, %u segments, %u smem", + block_size, + num_queries, + num_segments, + smem_size); + auto const& kernel_launcher = [&](auto const& kernel) -> void { + kernel<<>>(segment_descs, + topk, + nullptr, // source_indices_ptr + ps.num_random_samplings, + ps.rand_xor_mask, + 0, // num_seeds + hashmap_ptr, + max_candidates, + max_itopk, + ps.itopk_size, + ps.search_width, + ps.min_iterations, + ps.max_iterations, + nullptr, // num_executed_iterations + hash_bitlen, + small_hash_bitlen, + small_hash_reset_interval, + sample_filter); + }; + cuvs::neighbors::detail::safely_launch_kernel_with_smem_size(kernel, smem_size, kernel_launcher); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + } // namespace single_cta_search } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh index ba308db98b..2af57d421f 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh @@ -10,6 +10,23 @@ namespace cuvs::neighbors::cagra::detail::single_cta_search { +/** + * @brief Per-segment descriptor for the multi-segment CAGRA search kernel. + * + * One instance per Lucene segment; the kernel reads this array from device memory using + * blockIdx.z as the segment index. 
+ */ +template +struct alignas(16) multi_segment_desc_t { + const dataset_descriptor_base_t* dataset_desc; + const DataT* queries_ptr; // [num_queries, dim] for this segment + const IndexT* graph; // [dataset_size, graph_degree] + uint32_t graph_degree; + uint32_t _pad; + uintptr_t result_indices_ptr; // tagged pointer: [num_queries, top_k] + DistanceT* result_distances_ptr; // [num_queries, top_k] +}; + template +void select_and_run_multi_segment( + const multi_segment_desc_t* segment_descs, + uint32_t num_segments, + uint32_t num_queries, + const search_params& ps, + uint32_t topk, + uint32_t num_itopk_candidates, + uint32_t block_size, + uint32_t smem_size, + int64_t hash_bitlen, + IndexT* hashmap_ptr, + size_t small_hash_bitlen, + size_t small_hash_reset_interval, + SampleFilterT sample_filter, + cudaStream_t stream); + } diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSResources.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSResources.java index b105580328..01d500b948 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSResources.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSResources.java @@ -57,6 +57,24 @@ interface ScopedAccess extends AutoCloseable { */ Path tempDirectory(); + /** + * Configure the temporary workspace on this resources object as an uncapped pool backed by the + * current device memory resource. After the initial reservation is allocated on first use, + * subsequent calls to {@code cuvsRMMAlloc} / {@code cuvsRMMFree} on this handle hit the pool + * cache rather than calling {@code cudaMallocAsync} / {@code cudaFreeAsync}, reducing CUDA + * context lock contention under concurrent query threads. The pool grows without shrinking: + * freed allocations are returned to the pool rather than to the device, so the pool's + * high-water mark only increases until the resources object is closed. + * + *

    The pool is per-resources-handle (i.e. per query thread when resources are thread-local), + * so there is no cross-thread pool mutex contention. Call this once after creating the resources + * object; calling it again replaces the pool. + * + * @param initialSizeBytes initial pool reservation in bytes; size {@code initialSizeBytes} to + * cover the steady-state working set to avoid growth after warmup + */ + void setWorkspacePool(long initialSizeBytes); + /** * Creates a new resources. * Equivalent to diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/SynchronizedCuVSResources.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/SynchronizedCuVSResources.java index 64a72ec32a..aa74893c6f 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/SynchronizedCuVSResources.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/SynchronizedCuVSResources.java @@ -40,6 +40,11 @@ public void close() { inner.close(); } + @Override + public void setWorkspacePool(long sizeBytes) { + inner.setWorkspacePool(sizeBytes); + } + @Override public Path tempDirectory() { return inner.tempDirectory(); diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/MultiSegmentCagraSearch.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/MultiSegmentCagraSearch.java index 50cad34f39..ad2d5a449a 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/MultiSegmentCagraSearch.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/MultiSegmentCagraSearch.java @@ -7,52 +7,39 @@ import static com.nvidia.cuvs.internal.common.CloseableRMMAllocation.allocateRMMSegment; import static com.nvidia.cuvs.internal.common.Util.CudaMemcpyKind.DEVICE_TO_HOST; import static com.nvidia.cuvs.internal.common.Util.checkCuVSError; -import static com.nvidia.cuvs.internal.common.Util.checkCudaError; import static com.nvidia.cuvs.internal.common.Util.cudaMemcpyAsync; import static com.nvidia.cuvs.internal.common.Util.getStream; -import static 
com.nvidia.cuvs.internal.panama.headers_h.cudaEventRecord; +import static com.nvidia.cuvs.internal.common.Util.prepareTensor; +import static com.nvidia.cuvs.internal.panama.headers_h.cuvsCagraSearchMultiSegment; import static com.nvidia.cuvs.internal.panama.headers_h.cuvsStreamSync; -import static com.nvidia.cuvs.internal.panama.headers_h_1.cudaStreamWaitEvent; +import static com.nvidia.cuvs.internal.panama.headers_h.kDLCUDA; +import static com.nvidia.cuvs.internal.panama.headers_h.kDLFloat; +import static com.nvidia.cuvs.internal.panama.headers_h.kDLUInt; import com.nvidia.cuvs.internal.BufferedCagraSearch; +import com.nvidia.cuvs.internal.CuVSMatrixInternal; import com.nvidia.cuvs.internal.CuVSParamsHelper; -import com.nvidia.cuvs.internal.CuVSResourcesImpl; -import com.nvidia.cuvs.internal.CudaStreamPool; import com.nvidia.cuvs.internal.SelectKHelper; import java.lang.foreign.Arena; import java.lang.foreign.MemorySegment; import java.lang.foreign.ValueLayout; -import java.util.ArrayList; -import java.util.Arrays; import java.util.List; -import java.util.concurrent.Callable; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; /** * Performs a single-query approximate nearest neighbor search across multiple CAGRA index segments - * using a shared GPU buffer and a fixed-size CUDA stream pool, eliminating per-segment - * device-to-host copies. + * using a shared GPU buffer, eliminating per-segment device-to-host copies. * *

    Algorithm

    *
      *
    1. Allocate two global device buffers sized {@code numSegments × k}: * one for uint32 neighbor ordinals and one for float32 distances.
    2. - *
    3. Assign each segment a slot from the {@link CudaStreamPool} via round-robin. Segments on - * different slots run in parallel on separate CUDA streams.
    4. - *
    5. For each segment, call {@link BufferedCagraSearch#searchIntoBuffer} to queue the CAGRA - * search kernel. In non-persistent mode this enqueues asynchronously on the slot's CUDA - * stream. In persistent mode each call blocks on CPU until the GPU worker signals completion; - * all segments are submitted concurrently via {@link #ASYNC_SEARCH_POOL} so the GPU can - * execute multiple segment jobs in parallel (bounded by {@code worker_queue_size}).
    6. - *
    7. Record a CUDA event on each slot's stream; make the main stream wait on all events.
    8. + *
    9. Call {@code cuvsCagraSearchMultiSegment} which launches a single GPU kernel covering all + * segments concurrently (one CTA per segment), writing results into the global buffers.
    10. *
    11. Call {@code cuvsSelectK} on the main stream to find the global top-k entirely on GPU.
    12. *
    13. Sync the main stream.
    14. - *
    15. Copy the three result arrays to host in a single pass: - * k selected distances, k flat-array positions, and all {@code numSegments × k} ordinals.
    16. - *
    17. Decode each result: {@code segment = position / k}, {@code ordinal = ordinals[position]}.
    18. + *
    19. Copy the three result arrays to host in a single pass.
    20. + *
    21. Decode each result: {@code segment = position / k}, + * {@code ordinal = ordinals[position]}.
    22. *
    * * @since 25.10 @@ -61,22 +48,6 @@ public class MultiSegmentCagraSearch { private MultiSegmentCagraSearch() {} - /** - * Thread pool used to submit persistent-mode segment searches concurrently. - * - *

    In persistent mode, {@link BufferedCagraSearch#searchIntoBuffer} blocks on the CPU until - * the GPU signals completion via a system-scope atomic. Running each segment in its own thread - * allows the persistent kernel's job queue to hold all N segment jobs simultaneously, so GPU - * workers can execute them in parallel (bounded by {@code worker_queue_size}). - */ - private static final ExecutorService ASYNC_SEARCH_POOL = - Executors.newCachedThreadPool( - r -> { - Thread t = new Thread(r, "cuvs-segment-search"); - t.setDaemon(true); - return t; - }); - /** * Searches multiple CAGRA index segments for the global top-k nearest neighbors. * @@ -122,13 +93,7 @@ public static MultiSegmentSearchResults search( long outIdxBytes = (long) k * Long.BYTES; // int64 positions from select_k long outValBytes = (long) k * Float.BYTES; - // Assign a pool slot to each segment via round-robin. - CudaStreamPool pool = CuVSResourcesImpl.getStreamPool(resources); - int startSlot = pool.nextSlot(numSegments); - int[] slots = new int[numSegments]; - for (int i = 0; i < numSegments; i++) { - slots[i] = Math.floorMod(startSlot + i, pool.size()); - } + CagraSearchParams searchParameters = queries.get(0).getCagraSearchParameters(); try (var resourcesAccessor = resources.access()) { long cuvsRes = resourcesAccessor.handle(); @@ -139,110 +104,59 @@ public static MultiSegmentSearchResults search( var outIdxDP = allocateRMMSegment(cuvsRes, outIdxBytes); var outValDP = allocateRMMSegment(cuvsRes, outValBytes)) { - // --- Phase 1: queue all per-segment CAGRA searches --- - CagraSearchParams searchParameters = queries.get(0).getCagraSearchParameters(); - if (searchParameters.isPersistent()) { - // Persistent mode: searchIntoBuffer blocks on CPU (via system-scope atomic spin) until - // the GPU signals completion. Submit one task per pool slot in parallel so the GPU can - // work on multiple segment jobs concurrently, bounded by worker_queue_size. 
- // - // Segments are grouped by slot: if numSegments > pool.size(), multiple segments share a - // slot and must be serialized within that slot's task — each cuvsResources_t handle is - // not thread-safe for concurrent access (the descriptor_cache inside is not guarded). - // Parallelism = min(numSegments, pool.size()). - int poolSize = pool.size(); - // Collect segment indices per slot. Size: poolSize, each entry may have 0..n indices. - @SuppressWarnings("unchecked") - List[] segsBySlot = new List[poolSize]; - for (int slot = 0; slot < poolSize; slot++) { - segsBySlot[slot] = new ArrayList<>(); - } + // --- Phase 1: call cuvsCagraSearchMultiSegment --- + // Single kernel launch covers all segments; results land in globalNeighborsDP / + // globalDistancesDP on the same CUDA stream, so SelectK below sees them via ordering. + try (var arena = Arena.ofConfined()) { + MemorySegment sp = CuVSParamsHelper.buildCagraSearchParams(arena, searchParameters); + + // Build C arrays: cuvsCagraIndex_t* indices, DLManagedTensor** q/n/d + MemorySegment indexArray = arena.allocate(ValueLayout.ADDRESS, numSegments); + MemorySegment queriesArray = arena.allocate(ValueLayout.ADDRESS, numSegments); + MemorySegment neighborsArray = arena.allocate(ValueLayout.ADDRESS, numSegments); + MemorySegment distancesArray = arena.allocate(ValueLayout.ADDRESS, numSegments); + + long[] segShape = {1, k}; for (int i = 0; i < numSegments; i++) { - segsBySlot[slots[i]].add(i); + // Index handle + indexArray.setAtIndex(ValueLayout.ADDRESS, i, buffered[i].getIndexHandle()); + + // Query DLTensor + var queryVectors = (CuVSMatrixInternal) queries.get(i).getQueryVectors(); + queriesArray.setAtIndex(ValueLayout.ADDRESS, i, queryVectors.toTensor(arena)); + + // Neighbors DLTensor — slice of global buffer + long nByteOffset = (long) i * k * Integer.BYTES; + MemorySegment nSlice = + MemorySegment.ofAddress(globalNeighborsDP.handle().address() + nByteOffset); + neighborsArray.setAtIndex( + 
ValueLayout.ADDRESS, + i, + prepareTensor(arena, nSlice, segShape, kDLUInt(), 32, kDLCUDA())); + + // Distances DLTensor — slice of global buffer + long dByteOffset = (long) i * k * Float.BYTES; + MemorySegment dSlice = + MemorySegment.ofAddress(globalDistancesDP.handle().address() + dByteOffset); + distancesArray.setAtIndex( + ValueLayout.ADDRESS, + i, + prepareTensor(arena, dSlice, segShape, kDLFloat(), 32, kDLCUDA())); } - // Submit one task per occupied slot. - List> futures = new ArrayList<>(poolSize); - for (int slot = 0; slot < poolSize; slot++) { - if (segsBySlot[slot].isEmpty()) continue; - final int taskSlot = slot; - final List taskSegs = segsBySlot[slot]; - futures.add( - ASYNC_SEARCH_POOL.submit( - (Callable) - () -> { - try (var threadArena = Arena.ofConfined()) { - MemorySegment sp = - CuVSParamsHelper.buildCagraSearchParams( - threadArena, searchParameters); - for (int segIdx : taskSegs) { - buffered[segIdx].searchIntoBuffer( - queries.get(segIdx), - globalNeighborsDP.handle(), - globalDistancesDP.handle(), - segIdx, - pool.resources(taskSlot), - pool.stream(taskSlot), - sp, - threadArena); - } - } catch (Exception e) { - throw e; - } catch (Throwable t) { - throw new RuntimeException(t); - } - return null; - })); - } - for (Future f : futures) { - try { - f.get(); - } catch (ExecutionException e) { - throw e.getCause(); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw e; - } - } - } else { - // Non-persistent: each cuvsCagraSearch enqueues a CUDA kernel asynchronously and - // returns immediately; segments execute in parallel on their respective CUDA streams. - // A shared arena covers all per-call CPU allocations; it is closed once all launches - // have been enqueued. 
- try (var segArena = Arena.ofConfined()) { - MemorySegment searchParams = - CuVSParamsHelper.buildCagraSearchParams(segArena, searchParameters); - for (int i = 0; i < numSegments; i++) { - buffered[i].searchIntoBuffer( - queries.get(i), - globalNeighborsDP.handle(), - globalDistancesDP.handle(), - i, - pool.resources(slots[i]), - pool.stream(slots[i]), - searchParams, - segArena); - } - } - } - // --- Phase 2: event-based sync — make main stream wait for all segment streams --- - // Record one event per distinct slot (on the last kernel submitted to that slot); - // this is O(pool.size()) API calls instead of O(numSegments). - // Pool events are pre-allocated and reused across calls to avoid create/destroy overhead. - int[] lastSegmentForSlot = new int[pool.size()]; - Arrays.fill(lastSegmentForSlot, -1); - for (int i = 0; i < numSegments; i++) { - lastSegmentForSlot[slots[i]] = i; - } - for (int slot = 0; slot < pool.size(); slot++) { - if (lastSegmentForSlot[slot] >= 0) { - checkCudaError(cudaEventRecord(pool.event(slot), pool.stream(slot)), "cudaEventRecord"); - checkCudaError( - cudaStreamWaitEvent(cuvsStream, pool.event(slot), 0), "cudaStreamWaitEvent"); - } + checkCuVSError( + cuvsCagraSearchMultiSegment( + cuvsRes, + sp, + numSegments, + indexArray, + queriesArray, + neighborsArray, + distancesArray), + "cuvsCagraSearchMultiSegment"); } - // --- Phase 3: select global top-k on GPU (after all segment searches complete) --- + // --- Phase 2: select global top-k on GPU --- SelectKHelper.selectK( cuvsRes, globalDistancesDP.handle(), @@ -254,11 +168,9 @@ public static MultiSegmentSearchResults search( // No stream sync needed here: the D2H copies below are enqueued on the same cuvsStream, // so CUDA stream ordering guarantees selectK completes before the copies begin. - // --- Phase 4: single device-to-host copy for all three arrays --- - // Allocate one contiguous host buffer and slice into three typed views. 
+ // --- Phase 3: single device-to-host copy for all three arrays --- // Layout (in order of decreasing alignment): int64 outIdx | float32 outVal | uint32 // ordinals - // outIdxBytes is a multiple of Long.BYTES, so each slice is naturally aligned. try (var hostArena = Arena.ofConfined()) { MemorySegment hostBuf = hostArena.allocate(outIdxBytes + outValBytes + neighborsBytes, Long.BYTES); @@ -278,7 +190,7 @@ public static MultiSegmentSearchResults search( checkCuVSError(cuvsStreamSync(cuvsRes), "cuvsStreamSync after D2H copy"); - // --- Phase 5: decode results --- + // --- Phase 4: decode results --- int[] segmentIndices = new int[k]; int[] selectedOrdinals = new int[k]; float[] selectedDistances = new float[k]; diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/BufferedCagraSearch.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/BufferedCagraSearch.java index 20390ce4cc..7df129639c 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/BufferedCagraSearch.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/BufferedCagraSearch.java @@ -55,4 +55,11 @@ void searchIntoBuffer( MemorySegment searchParams, Arena arena) throws Throwable; + + /** + * Returns the raw {@code cuvsCagraIndex_t} handle as a {@link MemorySegment}. + * Used by {@link com.nvidia.cuvs.MultiSegmentCagraSearch} to build the index pointer array + * for {@code cuvsCagraSearchMultiSegment}. 
+ */ + MemorySegment getIndexHandle(); } diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CagraIndexImpl.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CagraIndexImpl.java index fb102c5fca..7d0d0e2907 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CagraIndexImpl.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CagraIndexImpl.java @@ -359,6 +359,11 @@ public SearchResults search(CagraQuery query) throws Throwable { * @param segmentIdx zero-based index of this segment; determines the write offset * @param segmentCuvsRes {@code cuvsResources_t} handle whose CUDA stream receives the kernel */ + @Override + public MemorySegment getIndexHandle() { + return cagraIndexReference.getMemorySegment(); + } + @Override public void searchIntoBuffer( CagraQuery query, diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSResourcesImpl.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSResourcesImpl.java index ca3bbea8af..a0466a6d7d 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSResourcesImpl.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSResourcesImpl.java @@ -80,6 +80,12 @@ public static CudaStreamPool getStreamPool(CuVSResources resources) { "Unsupported resources type: " + resources.getClass().getName()); } + @Override + public void setWorkspacePool(long sizeBytes) { + checkCuVSError( + cuvsResourcesSetWorkspacePool(resourceHandle, sizeBytes), "cuvsResourcesSetWorkspacePool"); + } + @Override public Path tempDirectory() { return tempDirectory; diff --git a/java/cuvs-java/src/test/java/com/nvidia/cuvs/CheckedCuVSResources.java b/java/cuvs-java/src/test/java/com/nvidia/cuvs/CheckedCuVSResources.java index e880edc85d..c615e5ee30 100644 --- a/java/cuvs-java/src/test/java/com/nvidia/cuvs/CheckedCuVSResources.java +++ b/java/cuvs-java/src/test/java/com/nvidia/cuvs/CheckedCuVSResources.java @@ -56,6 +56,11 @@ public void 
close() { inner.close(); } + @Override + public void setWorkspacePool(long sizeBytes) { + inner.setWorkspacePool(sizeBytes); + } + @Override public Path tempDirectory() { return inner.tempDirectory(); From 49e5a146831831fe04a79909d01fad5bf1a76860 Mon Sep 17 00:00:00 2001 From: James Xia Date: Mon, 20 Apr 2026 10:02:48 -0700 Subject: [PATCH 08/11] java: add DataType.HALF (float16) support to CuVSMatrix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add HALF(2) to the DataType enum and wire it through the Java layer so callers can build and exchange float16 matrices without any Java-side type conversion: - CuVSMatrix.Builder: new addVector(short[]) overload; each element is a raw IEEE 754 binary16 bit pattern held in a short - LinkerHelper: add C_SHORT via canonicalLayouts("short") - CuVSMatrixBaseImpl: map HALF → C_SHORT in valueLayoutFromType(); decode DLPack (kDLFloat, bits=16) → DataType.HALF in dataTypeFromTensor() - CuVSMatrixInternal: HALF maps to the same kDLFloat DLPack type code as FLOAT; the bits field (16 vs 32) distinguishes them on the C side - JDKProvider.MatrixBuilder: implement addVector(short[]) following the same MemorySegment.ofArray pattern as the other overloads --- .../src/main/java/com/nvidia/cuvs/CuVSMatrix.java | 8 ++++++++ .../com/nvidia/cuvs/internal/CuVSMatrixBaseImpl.java | 4 ++++ .../com/nvidia/cuvs/internal/CuVSMatrixInternal.java | 2 +- .../com/nvidia/cuvs/internal/common/LinkerHelper.java | 3 +++ .../src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java | 9 +++++++++ 5 files changed, 25 insertions(+), 1 deletion(-) diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSMatrix.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSMatrix.java index e0e39a4b4b..a19254ee3b 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSMatrix.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSMatrix.java @@ -18,6 +18,7 @@ public interface CuVSMatrix extends AutoCloseable { enum 
DataType { FLOAT(4), + HALF(2), INT(4), UINT(4), BYTE(1); @@ -94,6 +95,13 @@ interface Builder { */ void addVector(int[] vector); + /** + * Adds a single vector to the matrix. Each element is a raw float16 bit pattern stored in a short. + * + * @param vector A short array of as many elements as the dimensions + */ + void addVector(short[] vector); + T build(); } diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSMatrixBaseImpl.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSMatrixBaseImpl.java index 98f4095ffc..03e08ac4c4 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSMatrixBaseImpl.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSMatrixBaseImpl.java @@ -7,6 +7,7 @@ import static com.nvidia.cuvs.internal.common.LinkerHelper.C_CHAR; import static com.nvidia.cuvs.internal.common.LinkerHelper.C_FLOAT; import static com.nvidia.cuvs.internal.common.LinkerHelper.C_INT; +import static com.nvidia.cuvs.internal.common.LinkerHelper.C_SHORT; import static com.nvidia.cuvs.internal.common.Util.checkCuVSError; import static com.nvidia.cuvs.internal.panama.headers_h.*; @@ -95,6 +96,7 @@ public ValueLayout valueLayout() { protected static ValueLayout valueLayoutFromType(DataType dataType) { return switch (dataType) { case FLOAT -> C_FLOAT; + case HALF -> C_SHORT; case INT, UINT -> C_INT; case BYTE -> C_CHAR; }; @@ -177,6 +179,8 @@ private static DataType dataTypeFromTensor(byte code, byte bits) { dataType = DataType.INT; } else if (code == kDLFloat() && bits == 32) { dataType = DataType.FLOAT; + } else if (code == kDLFloat() && bits == 16) { + dataType = DataType.HALF; } else if ((code == kDLInt() || code == kDLUInt()) && bits == 8) { dataType = DataType.BYTE; } else { diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSMatrixInternal.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSMatrixInternal.java index 35715b8336..6b4b617493 100644 --- 
a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSMatrixInternal.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSMatrixInternal.java @@ -38,7 +38,7 @@ default int code() { static int code(DataType dataType) { return switch (dataType) { - case FLOAT -> kDLFloat(); + case FLOAT, HALF -> kDLFloat(); case INT -> kDLInt(); case UINT, BYTE -> kDLUInt(); }; diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/common/LinkerHelper.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/common/LinkerHelper.java index 6de70ce920..79b440b4a8 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/common/LinkerHelper.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/common/LinkerHelper.java @@ -25,6 +25,9 @@ public class LinkerHelper { public static final ValueLayout.OfLong C_LONG = (ValueLayout.OfLong) LINKER.canonicalLayouts().get("long"); + public static final ValueLayout.OfShort C_SHORT = + (ValueLayout.OfShort) LINKER.canonicalLayouts().get("short"); + public static final ValueLayout.OfFloat C_FLOAT = (ValueLayout.OfFloat) LINKER.canonicalLayouts().get("float"); diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java index 4594ad1e93..72f0b4e232 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java @@ -612,6 +612,15 @@ public void addVector(int[] vector) { internalAddVector(MemorySegment.ofArray(vector)); } + public void addVector(short[] vector) { + if (vector.length != columns) { + throw new IllegalArgumentException( + String.format( + Locale.ROOT, "Expected a vector of size [%d], got [%d]", columns, vector.length)); + } + internalAddVector(MemorySegment.ofArray(vector)); + } + protected abstract void internalAddVector(MemorySegment vector); } From 29751c7becf58317c14246fbe3f1368a5ef4a562 Mon 
Sep 17 00:00:00 2001 From: James Xia Date: Wed, 22 Apr 2026 17:07:26 -0700 Subject: [PATCH 09/11] Fix errors after merge --- c/src/core/c_api.cpp | 14 +++++++------- cpp/src/neighbors/cagra.cuh | 2 +- .../detail/cagra/search_single_cta_inst.cu.in | 4 ++++ 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/c/src/core/c_api.cpp b/c/src/core/c_api.cpp index 09f652c613..05e3856da1 100644 --- a/c/src/core/c_api.cpp +++ b/c/src/core/c_api.cpp @@ -43,10 +43,10 @@ extern "C" cuvsError_t cuvsResourcesSetWorkspacePool(cuvsResources_t res, size_t auto res_ptr = reinterpret_cast(res); // Create an uncapped pool: pre-warms with initial_size_bytes to avoid cudaMalloc on every // query, but can grow beyond that if an allocation exceeds the initial reservation. - auto pool_mr = - std::make_shared>( - rmm::mr::get_current_device_resource(), initial_size_bytes); - raft::resource::set_workspace_resource(*res_ptr, pool_mr); + raft::resource::set_workspace_resource( + *res_ptr, + rmm::mr::pool_memory_resource{rmm::mr::get_current_device_resource_ref(), + initial_size_bytes}); }); } @@ -148,7 +148,7 @@ extern "C" cuvsError_t cuvsRMMAlloc(cuvsResources_t res, void** ptr, size_t byte return cuvs::core::translate_exceptions([=] { auto res_ptr = reinterpret_cast(res); auto stream = raft::resource::get_cuda_stream(*res_ptr); - *ptr = raft::resource::get_workspace_resource(*res_ptr)->allocate(stream, bytes); + *ptr = raft::resource::get_workspace_resource_ref(*res_ptr).allocate(stream, bytes); }); } @@ -157,7 +157,7 @@ extern "C" cuvsError_t cuvsRMMFree(cuvsResources_t res, void* ptr, size_t bytes) return cuvs::core::translate_exceptions([=] { auto res_ptr = reinterpret_cast(res); auto stream = raft::resource::get_cuda_stream(*res_ptr); - raft::resource::get_workspace_resource(*res_ptr)->deallocate(stream, ptr, bytes); + raft::resource::get_workspace_resource_ref(*res_ptr).deallocate(stream, ptr, bytes); }); } @@ -185,7 +185,7 @@ extern "C" cuvsError_t 
cuvsRMMAsyncMemoryResourceEnable() { return cuvs::core::translate_exceptions([=] { async_mr = std::make_shared(); - rmm::mr::set_current_device_resource(async_mr.get()); + rmm::mr::set_current_device_resource(*async_mr); }); } diff --git a/cpp/src/neighbors/cagra.cuh b/cpp/src/neighbors/cagra.cuh index 62fa379fb6..8df172cff0 100644 --- a/cpp/src/neighbors/cagra.cuh +++ b/cpp/src/neighbors/cagra.cuh @@ -406,7 +406,7 @@ index merge(raft::resources const& handle, return cagra::detail::merge(handle, params, indices, row_filter); } -template +template void search_multi_segment( raft::resources const& res, search_params const& params, diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cu.in b/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cu.in index 85342e7093..c2d30fa6f5 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cu.in +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cu.in @@ -20,5 +20,9 @@ instantiate_kernel_selection(data_t, float, cuvs::neighbors::filtering::none_sample_filter); instantiate_kernel_selection(data_t, uint32_t, float, bitset_filter_t); +instantiate_kernel_selection_ms(data_t, + uint32_t, + float, + cuvs::neighbors::filtering::none_sample_filter); } // namespace cuvs::neighbors::cagra::detail::single_cta_search From 2fffcb8b59f21054a1cafb12309f3fd1a1ffcb23 Mon Sep 17 00:00:00 2001 From: James Xia Date: Wed, 22 Apr 2026 17:08:07 -0700 Subject: [PATCH 10/11] Remove dead code --- .../cuvs/internal/CuVSResourcesImpl.java | 15 -- .../nvidia/cuvs/internal/CudaStreamPool.java | 133 ------------------ 2 files changed, 148 deletions(-) delete mode 100644 java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CudaStreamPool.java diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSResourcesImpl.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSResourcesImpl.java index a0466a6d7d..e421cad660 100644 --- 
a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSResourcesImpl.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSResourcesImpl.java @@ -28,9 +28,6 @@ public class CuVSResourcesImpl implements CuVSResources { private final int deviceId; private final PinnedMemoryBuffer hostBuffer = new PinnedMemoryBuffer(); - private final CudaStreamPool streamPool = - new CudaStreamPool( - Integer.getInteger(CudaStreamPool.SIZE_PROPERTY, CudaStreamPool.DEFAULT_SIZE)); /** * Constructor that allocates the resources needed for cuVS @@ -62,24 +59,12 @@ public int deviceId() { @Override public void close() { synchronized (this) { - streamPool.close(); int returnValue = cuvsResourcesDestroy(resourceHandle); checkCuVSError(returnValue, "cuvsResourcesDestroy"); hostBuffer.close(); } } - /** - * Returns the {@link CudaStreamPool} owned by the given {@link CuVSResources} instance. - */ - public static CudaStreamPool getStreamPool(CuVSResources resources) { - if (resources instanceof CuVSResourcesImpl impl) { - return impl.streamPool; - } - throw new IllegalArgumentException( - "Unsupported resources type: " + resources.getClass().getName()); - } - @Override public void setWorkspacePool(long sizeBytes) { checkCuVSError( diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CudaStreamPool.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CudaStreamPool.java deleted file mode 100644 index cddb3b5cb5..0000000000 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CudaStreamPool.java +++ /dev/null @@ -1,133 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
- * SPDX-License-Identifier: Apache-2.0 - */ -package com.nvidia.cuvs.internal; - -import static com.nvidia.cuvs.internal.common.Util.checkCuVSError; -import static com.nvidia.cuvs.internal.common.Util.checkCudaError; -import static com.nvidia.cuvs.internal.panama.headers_h.cudaEventCreateWithFlags; -import static com.nvidia.cuvs.internal.panama.headers_h.cudaEventDestroy; -import static com.nvidia.cuvs.internal.panama.headers_h.cuvsResourcesCreate; -import static com.nvidia.cuvs.internal.panama.headers_h.cuvsResourcesDestroy; -import static com.nvidia.cuvs.internal.panama.headers_h.cuvsResources_t; -import static com.nvidia.cuvs.internal.panama.headers_h.cuvsStreamSet; -import static com.nvidia.cuvs.internal.panama.headers_h_1.cudaEventDisableTiming; -import static com.nvidia.cuvs.internal.panama.headers_h_1.cudaEvent_t; -import static com.nvidia.cuvs.internal.panama.headers_h_1.cudaStreamCreateWithFlags; -import static com.nvidia.cuvs.internal.panama.headers_h_1.cudaStreamDestroy; -import static com.nvidia.cuvs.internal.panama.headers_h_1.cudaStreamNonBlocking; -import static com.nvidia.cuvs.internal.panama.headers_h_1.cudaStream_t; - -import java.lang.foreign.Arena; -import java.lang.foreign.MemorySegment; - -/** - * A fixed-size pool of CUDA streams used by {@link com.nvidia.cuvs.MultiSegmentCagraSearch} - * to run per-segment CAGRA searches in parallel. - * - *

    Each pool slot owns one {@code cuvsResources_t} handle backed by a dedicated non-blocking - * CUDA stream. Callers assign segments to slots via round-robin using {@link #nextSlot}; the - * GPU executes searches on different slots concurrently, then synchronizes via CUDA events before - * the global {@code cuvsSelectK} call. - * - *

    Lifecycle

    - *

    One pool is owned by each {@code CuVSResourcesImpl} instance and closed when that instance - * is closed. This gives each thread its own independent set of streams and events, eliminating - * races when multiple threads perform concurrent multi-segment searches. - * - *

    Configuration

    - *

    Pool size defaults to {@value #DEFAULT_SIZE} and can be overridden via the system property - * {@value #SIZE_PROPERTY}. - */ -public final class CudaStreamPool implements AutoCloseable { - - /** Default number of streams in the pool. */ - public static final int DEFAULT_SIZE = 8; - - /** System property name for overriding the pool size. */ - public static final String SIZE_PROPERTY = "com.nvidia.cuvs.streamPoolSize"; - - /** Round-robin counter; advanced by {@link #nextSlot(int)} on each search call. */ - private int slotCounter; - - private final long[] resources; // cuvsResources_t handles - private final MemorySegment[] streams; // cudaStream_t handle values - private final MemorySegment[] events; // pre-allocated cudaEvent_t handles, one per slot - private final int size; - - CudaStreamPool(int size) { - this.size = size; - this.resources = new long[size]; - this.streams = new MemorySegment[size]; - this.events = new MemorySegment[size]; - try (var arena = Arena.ofConfined()) { - for (int i = 0; i < size; i++) { - // Create a non-blocking CUDA stream (avoids implicit sync with the default stream). - var pStream = arena.allocate(cudaStream_t); - checkCudaError( - cudaStreamCreateWithFlags(pStream, cudaStreamNonBlocking()), "cudaStreamCreate"); - streams[i] = pStream.get(cudaStream_t, 0); - - // Create a cuvsResources_t and assign the new stream to it. - var pRes = arena.allocate(cuvsResources_t); - checkCuVSError(cuvsResourcesCreate(pRes), "cuvsResourcesCreate"); - resources[i] = pRes.get(cuvsResources_t, 0); - checkCuVSError(cuvsStreamSet(resources[i], streams[i]), "cuvsStreamSet"); - - // Pre-allocate one reusable event per slot (disable timing to avoid overhead). 
- var pEvent = arena.allocate(cudaEvent_t); - checkCudaError( - cudaEventCreateWithFlags(pEvent, cudaEventDisableTiming()), "cudaEventCreate"); - events[i] = pEvent.get(cudaEvent_t, 0); - } - } - } - - // ------------------------------------------------------------------------- - // Per-slot accessors - // ------------------------------------------------------------------------- - - /** Returns the {@code cuvsResources_t} handle for the given slot. */ - public long resources(int slot) { - return resources[slot]; - } - - /** Returns the CUDA stream handle for the given slot. */ - public MemorySegment stream(int slot) { - return streams[slot]; - } - - /** Returns the pre-allocated CUDA event handle for the given slot. */ - public MemorySegment event(int slot) { - return events[slot]; - } - - /** Returns the number of slots in this pool. */ - public int size() { - return size; - } - - /** - * Advances the round-robin counter by {@code count} and returns the starting slot index for - * this call. Slot indices are wrapped modulo {@link #size()}. 
- */ - public int nextSlot(int count) { - int start = slotCounter; - slotCounter += count; - return start; - } - - // ------------------------------------------------------------------------- - // Lifecycle - // ------------------------------------------------------------------------- - - @Override - public void close() { - for (int i = 0; i < size; i++) { - checkCudaError(cudaEventDestroy(events[i]), "cudaEventDestroy"); - cuvsResourcesDestroy(resources[i]); - checkCudaError(cudaStreamDestroy(streams[i]), "cudaStreamDestroy"); - } - } -} From d284bd0f02799a05736c548aae27fbbb930a89aa Mon Sep 17 00:00:00 2001 From: James Xia Date: Fri, 1 May 2026 06:45:55 -0700 Subject: [PATCH 11/11] Remove remnants of experimental code --- .../cagra/search_single_cta_kernel-inl.cuh | 128 ++++++++++-------- 1 file changed, 69 insertions(+), 59 deletions(-) diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index c0aaa1ec05..a0b66acc4b 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -1327,10 +1327,6 @@ struct alignas(kCacheLineBytes) job_desc_t { const data_type* queries_ptr; // [num_queries, dataset_dim] uint32_t top_k; uint32_t n_queries; - // Make index details job-specific so the persistent kernel singleton can serve multiple indexes - const DATASET_DESCRIPTOR_T* dataset_desc_ptr; - const index_type* graph_ptr; - uint32_t graph_degree; }; using blob_elem_type = uint4; constexpr static inline size_t kBlobSize = @@ -1374,9 +1370,12 @@ template RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel_p( + const DATASET_DESCRIPTOR_T* dataset_desc, worker_handle_t* worker_handles, job_desc_t* job_descriptors, uint32_t* completion_counters, + const typename DATASET_DESCRIPTOR_T::INDEX_T* const knn_graph, // [dataset_size, graph_degree] + const std::uint32_t graph_degree, const SourceIndexT* 
source_indices_ptr, const unsigned num_distilation, const uint64_t rand_xor_mask, @@ -1435,10 +1434,6 @@ RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel_p( auto top_k = job_descriptor.value.top_k; auto n_queries = job_descriptor.value.n_queries; auto query_id = worker_data.value.query_id; - // per-job index pointers - auto* dataset_desc = job_descriptor.value.dataset_desc_ptr; - auto* knn_graph = job_descriptor.value.graph_ptr; - auto graph_degree = job_descriptor.value.graph_degree; // work phase search_core kernel_t { - switch (select_topk_variant(itopk_size, num_itopk_candidates, block_size)) { - case TopkVariant::BITONIC: - return dispatch_kernel; - case TopkVariant::BITONIC_MERGE_MULTI: - return dispatch_kernel; - default: - return dispatch_kernel; + assert(itopk_size <= 512); + if (num_itopk_candidates <= 256) { + if (itopk_size <= 256) { + return dispatch_kernel; + } else { + assert(block_size >= 64); + return dispatch_kernel; + } + } else { + // Radix-based topk is used + return dispatch_kernel; } } }; @@ -2014,6 +2028,7 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b using job_desc_type = job_desc_t; kernel_type kernel; uint32_t block_size; + dataset_descriptor_host dd_host; rmm::device_uvector worker_handles; rmm::device_uvector job_descriptors; rmm::device_uvector completion_counters; @@ -2025,9 +2040,9 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b * Calculate the hash of the parameters to detect if they've changed across the calls. * NB: this must have the same argument types as the constructor. */ - // NB: dataset_desc and graph are intentionally excluded from the hash — they are now per-job - // fields stored in the job descriptor, so the runner is keyed only on fixed kernel parameters. 
static inline auto calculate_parameter_hash( + std::reference_wrapper> dataset_desc, + raft::device_matrix_view graph, const SourceIndexT* source_indices_ptr, uint32_t max_candidates, uint32_t num_itopk_candidates, @@ -2048,13 +2063,16 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b float persistent_lifetime, float persistent_device_usage) -> uint64_t { - return uint64_t(source_indices_ptr) ^ num_itopk_candidates ^ block_size ^ smem_size ^ + return uint64_t(graph.data_handle()) ^ uint64_t(source_indices_ptr) ^ + dataset_desc.get().team_size ^ num_itopk_candidates ^ block_size ^ smem_size ^ hash_bitlen ^ small_hash_reset_interval ^ num_random_samplings ^ rand_xor_mask ^ num_seeds ^ itopk_size ^ search_width ^ min_iterations ^ max_iterations ^ uint64_t(persistent_lifetime * 1000) ^ uint64_t(persistent_device_usage * 1000); } persistent_runner_t( + std::reference_wrapper> dataset_desc, + raft::device_matrix_view graph, const SourceIndexT* source_indices_ptr, uint32_t max_candidates, uint32_t num_itopk_candidates, @@ -2082,7 +2100,10 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b job_descriptors(kMaxJobsNum, stream, job_descriptor_mr), completion_counters(kMaxJobsNum, stream, device_mr), hashmap(0, stream, device_mr), - param_hash(calculate_parameter_hash(source_indices_ptr, + dd_host{dataset_desc.get()}, + param_hash(calculate_parameter_hash(dd_host, + graph, + source_indices_ptr, max_candidates, num_itopk_candidates, block_size, @@ -2102,7 +2123,8 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b persistent_lifetime, persistent_device_usage)) { - // dataset_desc and graph are now per-job fields; no fixed device descriptor at launch time. 
+ // initialize the dataset/distance descriptor + auto* dd_dev_ptr = dd_host.dev_ptr(stream); // set kernel attributes same as in normal kernel RAFT_CUDA_TRY( @@ -2145,14 +2167,18 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b } // launch the kernel - // dataset_desc and graph are per-job (in the job descriptor); not passed as kernel args. + auto* graph_ptr = graph.data_handle(); + uint32_t graph_degree = graph.extent(1); uint32_t* num_executed_iterations = nullptr; // optional arg [num_queries] const index_type* dev_seed_ptr = nullptr; // optional arg [num_queries, num_seeds] void* args[] = // NOLINT - {&worker_handles_ptr, + {&dd_dev_ptr, + &worker_handles_ptr, &job_descriptors_ptr, &completion_counters_ptr, + &graph_ptr, // [dataset_size, graph_degree] + &graph_degree, &source_indices_ptr, &num_random_samplings, &rand_xor_mask, @@ -2192,14 +2218,11 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b RAFT_LOG_INFO("Destroyed the persistent runner."); } - void launch(uintptr_t result_indices_ptr, // [num_queries, top_k] - distance_type* result_distances_ptr, // [num_queries, top_k] - const data_type* queries_ptr, // [num_queries, dataset_dim] + void launch(uintptr_t result_indices_ptr, // [num_queries, top_k] + distance_type* result_distances_ptr, // [num_queries, top_k] + const data_type* queries_ptr, // [num_queries, dataset_dim] uint32_t num_queries, - uint32_t top_k, - const descriptor_base_type* dd_dev_ptr, // device descriptor for this segment - const index_type* graph_ptr, // graph for this segment - uint32_t graph_degree) // graph degree for this segment + uint32_t top_k) { // submit all queries launcher_t launcher{job_queue, @@ -2212,20 +2235,14 @@ struct alignas(kCacheLineBytes) persistent_runner_t : public persistent_runner_b result_distances_ptr, queries_ptr, top_k, - num_queries, - dd_dev_ptr, - graph_ptr, - graph_degree](uint32_t job_ix) { - auto& jd = 
job_descriptors.data()[job_ix].input.value; - auto* cflag = &job_descriptors.data()[job_ix].completion_flag; - jd.result_indices_ptr = result_indices_ptr; - jd.result_distances_ptr = result_distances_ptr; - jd.queries_ptr = queries_ptr; - jd.top_k = top_k; - jd.n_queries = num_queries; - jd.dataset_desc_ptr = dd_dev_ptr; - jd.graph_ptr = graph_ptr; - jd.graph_degree = graph_degree; + num_queries](uint32_t job_ix) { + auto& jd = job_descriptors.data()[job_ix].input.value; + auto* cflag = &job_descriptors.data()[job_ix].completion_flag; + jd.result_indices_ptr = result_indices_ptr; + jd.result_distances_ptr = result_distances_ptr; + jd.queries_ptr = queries_ptr; + jd.top_k = top_k; + jd.n_queries = num_queries; cflag->store(false, cuda::memory_order_relaxed); cuda::atomic_thread_fence(cuda::memory_order_release, cuda::thread_scope_system); @@ -2423,14 +2440,14 @@ void select_and_run( if (ps.persistent) { using runner_type = persistent_runner_t; - // Initialize the device descriptor on the caller's stream (lazy, cached after first call). - // Synchronize to guarantee the upload completes before the GPU kernel reads the pointer from - // the job descriptor. This sync is cheap on all subsequent calls (stream is empty). - auto* dd_dev_ptr = dataset_desc.dev_ptr(stream); - RAFT_CUDA_TRY(cudaStreamSynchronize(stream)); - - // Runner is keyed on fixed kernel parameters only; dataset/graph are now per-job. - get_runner(source_indices_ptr, + get_runner(/* +Note, we're passing the descriptor by reference here, and this reference is going to be passed to a +new spawned thread, which is dangerous. However, the descriptor is copied in that thread before the +control is returned in this thread (in persistent_runner_t constructor), so we're safe. 
+*/ + std::cref(dataset_desc), + graph, + source_indices_ptr, max_candidates, num_itopk_candidates, block_size, @@ -2449,14 +2466,7 @@ void select_and_run( sample_filter, ps.persistent_lifetime, ps.persistent_device_usage) - ->launch(topk_indices_ptr, - topk_distances_ptr, - queries_ptr, - num_queries, - topk, - dd_dev_ptr, - graph.data_handle(), - static_cast(graph.extent(1))); + ->launch(topk_indices_ptr, topk_distances_ptr, queries_ptr, num_queries, topk); } else { using descriptor_base_type = dataset_descriptor_base_t; auto kernel = search_kernel_config::