diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt index 2439f9269f..df0c81923f 100644 --- a/c/CMakeLists.txt +++ b/c/CMakeLists.txt @@ -104,6 +104,7 @@ add_library( src/preprocessing/quantize/pq.cpp src/preprocessing/quantize/scalar.cpp src/distance/pairwise_distance.cpp + src/selection/select_k.cpp ) add_library(cuvs::c_api ALIAS cuvs_c) set_target_properties( diff --git a/c/include/cuvs/core/c_api.h b/c/include/cuvs/core/c_api.h index b9941d6ae4..622fe4cdcf 100644 --- a/c/include/cuvs/core/c_api.h +++ b/c/include/cuvs/core/c_api.h @@ -129,6 +129,22 @@ cuvsError_t cuvsStreamSync(cuvsResources_t res); */ cuvsError_t cuvsDeviceIdGet(cuvsResources_t res, int* device_id); +/** + * @brief Configure the temporary workspace on this resources object as an uncapped pool, backed + * by the current device memory resource. After the initial reservation is allocated on + * first use, subsequent calls to cuvsRMMAlloc / cuvsRMMFree on the same resources handle + * hit the pool cache rather than calling cudaMallocAsync / cudaFreeAsync, reducing CUDA + * context lock contention under concurrent query threads. The pool grows without shrinking: + * freed allocations are returned to the pool rather than to the device, so the pool's + * high-water mark only increases until the resources object is destroyed. 
+ * + * @param[in] res cuvsResources_t opaque C handle + * @param[in] initial_size_bytes initial pool reservation in bytes; size to cover the + * steady-state working set to avoid growth after warmup + * @return cuvsError_t + */ +cuvsError_t cuvsResourcesSetWorkspacePool(cuvsResources_t res, size_t initial_size_bytes); + /** * @brief Create an Initialized opaque C handle for C++ type `raft::device_resources_snmg` * for multi-GPU operations @@ -210,6 +226,19 @@ cuvsError_t cuvsRMMFree(cuvsResources_t res, void* ptr, size_t bytes); cuvsError_t cuvsRMMPoolMemoryResourceEnable(int initial_pool_size_percent, int max_pool_size_percent, bool managed); +/** + * @brief Switches the working memory resource to use stream-ordered asynchronous allocation + * (cudaMallocAsync / cudaFreeAsync). Unlike the pool resource, this resource returns memory to + * the stream immediately without blocking the CPU, eliminating device-wide synchronization on + * deallocation. This is especially beneficial when multiple CAGRA searches run concurrently on + * separate CUDA streams, because the internal workspace allocations no longer serialize kernel + * launches. Be aware that this function will change the memory resource for the whole process + * and the new memory resource will be used until explicitly changed. + * + * @return cuvsError_t + */ +cuvsError_t cuvsRMMAsyncMemoryResourceEnable(); + /** * @brief Resets the memory resource to use the default memory resource (cuda_memory_resource) * @return cuvsError_t diff --git a/c/include/cuvs/neighbors/cagra.h b/c/include/cuvs/neighbors/cagra.h index 3304e10f37..f6fc7da8e5 100644 --- a/c/include/cuvs/neighbors/cagra.h +++ b/c/include/cuvs/neighbors/cagra.h @@ -712,6 +712,34 @@ cuvsError_t cuvsCagraSearch(cuvsResources_t res, DLManagedTensor* distances, cuvsFilter filter); +/** + * @brief Search multiple CAGRA index segments concurrently using a single GPU kernel launch. 
+ * + * Launches a single kernel with grid (1, num_queries, num_segments) so each CTA handles one + * (query, segment) pair concurrently. All results land in the caller-supplied device buffers + * on the same CUDA stream, so downstream operations (e.g. selectK) see them via stream ordering + * with no explicit synchronization needed. + * + * Only float32 datasets are currently supported. Distance values are comparable across segments + * (same scale) but are not postprocessed (no kScale correction) — they are suitable for + * relative comparison (selectK / recall). + * + * @param[in] res cuvsResources_t opaque C handle + * @param[in] params search parameters + * @param[in] num_segments number of index segments + * @param[in] indices array of num_segments cuvsCagraIndex_t pointers + * @param[in] queries array of num_segments DLManagedTensor* (device, float32, [nq, dim]) + * @param[out] neighbors array of num_segments DLManagedTensor* (device, uint32, [nq, topk]) + * @param[out] distances array of num_segments DLManagedTensor* (device, float32, [nq, topk]) + */ +cuvsError_t cuvsCagraSearchMultiSegment(cuvsResources_t res, + cuvsCagraSearchParams_t params, + uint32_t num_segments, + cuvsCagraIndex_t* indices, + DLManagedTensor** queries, + DLManagedTensor** neighbors, + DLManagedTensor** distances); + /** * @} */ diff --git a/c/include/cuvs/selection/select_k.h b/c/include/cuvs/selection/select_k.h new file mode 100644 index 0000000000..ad79b9e3a7 --- /dev/null +++ b/c/include/cuvs/selection/select_k.h @@ -0,0 +1,37 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Select the k smallest values from a flat device array of n candidates. + * + * Treats `in_val` as a matrix of shape [1, n] and selects the `k` smallest + * float values. 
`out_idx` receives the int64 column positions of the selected + * values in [0, n), so the caller can recover per-segment identity as: + * + * segment_index = out_idx[j] / segment_k + * position_in_segment = out_idx[j] % segment_k + * + * @param[in] res cuvsResources_t handle + * @param[in] in_val DLManagedTensor* shape [1, n], float32, device memory + * @param[out] out_val DLManagedTensor* shape [1, k], float32, device memory + * @param[out] out_idx DLManagedTensor* shape [1, k], int64, device memory + * @return cuvsError_t + */ +cuvsError_t cuvsSelectK(cuvsResources_t res, + DLManagedTensor* in_val, + DLManagedTensor* out_val, + DLManagedTensor* out_idx); + +#ifdef __cplusplus +} +#endif diff --git a/c/src/core/c_api.cpp b/c/src/core/c_api.cpp index f4e3664482..05e3856da1 100644 --- a/c/src/core/c_api.cpp +++ b/c/src/core/c_api.cpp @@ -9,11 +9,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -35,6 +37,19 @@ extern "C" cuvsError_t cuvsResourcesCreate(cuvsResources_t* res) }); } +extern "C" cuvsError_t cuvsResourcesSetWorkspacePool(cuvsResources_t res, size_t initial_size_bytes) +{ + return cuvs::core::translate_exceptions([=] { + auto res_ptr = reinterpret_cast(res); + // Create an uncapped pool: pre-warms with initial_size_bytes to avoid cudaMalloc on every + // query, but can grow beyond that if an allocation exceeds the initial reservation. 
+ raft::resource::set_workspace_resource( + *res_ptr, + rmm::mr::pool_memory_resource{rmm::mr::get_current_device_resource_ref(), + initial_size_bytes}); + }); +} + extern "C" cuvsError_t cuvsResourcesDestroy(cuvsResources_t res) { return cuvs::core::translate_exceptions([=] { @@ -132,8 +147,8 @@ extern "C" cuvsError_t cuvsRMMAlloc(cuvsResources_t res, void** ptr, size_t byte { return cuvs::core::translate_exceptions([=] { auto res_ptr = reinterpret_cast(res); - auto mr = rmm::mr::get_current_device_resource_ref(); - *ptr = mr.allocate(raft::resource::get_cuda_stream(*res_ptr), bytes); + auto stream = raft::resource::get_cuda_stream(*res_ptr); + *ptr = raft::resource::get_workspace_resource_ref(*res_ptr).allocate(stream, bytes); }); } @@ -141,11 +156,13 @@ extern "C" cuvsError_t cuvsRMMFree(cuvsResources_t res, void* ptr, size_t bytes) { return cuvs::core::translate_exceptions([=] { auto res_ptr = reinterpret_cast(res); - auto mr = rmm::mr::get_current_device_resource_ref(); - mr.deallocate(raft::resource::get_cuda_stream(*res_ptr), ptr, bytes); + auto stream = raft::resource::get_cuda_stream(*res_ptr); + raft::resource::get_workspace_resource_ref(*res_ptr).deallocate(stream, ptr, bytes); }); } +thread_local std::shared_ptr async_mr; + extern "C" cuvsError_t cuvsRMMPoolMemoryResourceEnable(int initial_pool_size_percent, int max_pool_size_percent, bool managed) @@ -164,9 +181,20 @@ extern "C" cuvsError_t cuvsRMMPoolMemoryResourceEnable(int initial_pool_size_per }); } +extern "C" cuvsError_t cuvsRMMAsyncMemoryResourceEnable() +{ + return cuvs::core::translate_exceptions([=] { + async_mr = std::make_shared(); + rmm::mr::set_current_device_resource(*async_mr); + }); +} + extern "C" cuvsError_t cuvsRMMMemoryResourceReset() { - return cuvs::core::translate_exceptions([=] { rmm::mr::reset_current_device_resource(); }); + return cuvs::core::translate_exceptions([=] { + rmm::mr::reset_current_device_resource(); + async_mr.reset(); + }); } thread_local std::unique_ptr 
pinned_mr; diff --git a/c/src/neighbors/cagra.cpp b/c/src/neighbors/cagra.cpp index 081179ca46..2a4edda2a8 100644 --- a/c/src/neighbors/cagra.cpp +++ b/c/src/neighbors/cagra.cpp @@ -689,6 +689,54 @@ extern "C" cuvsError_t cuvsCagraSearch(cuvsResources_t res, }); } +extern "C" cuvsError_t cuvsCagraSearchMultiSegment(cuvsResources_t res, + cuvsCagraSearchParams_t params, + uint32_t num_segments, + cuvsCagraIndex_t* indices, + DLManagedTensor** queries, + DLManagedTensor** neighbors, + DLManagedTensor** distances) +{ + return cuvs::core::translate_exceptions([=] { + RAFT_EXPECTS(num_segments > 0, "num_segments must be > 0"); + RAFT_EXPECTS(indices != nullptr && queries != nullptr && neighbors != nullptr && + distances != nullptr, + "All pointer arrays must be non-null"); + + auto res_ptr = reinterpret_cast(res); + auto search_params = cuvs::neighbors::cagra::search_params(); + convert_c_search_params(*params, &search_params); + + // Only float32 is supported for multi-segment search. + RAFT_EXPECTS( + indices[0]->dtype.code == kDLFloat && indices[0]->dtype.bits == 32, + "Multi-segment search only supports float32 indices"); + + using T = float; + using IdxT = uint32_t; + using OutIdxT = uint32_t; + using DistanceT = float; + using IndexT = cuvs::neighbors::cagra::index; + + std::vector idx_vec(num_segments); + std::vector> q_vec(num_segments); + std::vector> n_vec(num_segments); + std::vector> d_vec(num_segments); + + for (uint32_t i = 0; i < num_segments; i++) { + RAFT_EXPECTS(indices[i] != nullptr && indices[i]->addr != 0, + "Index at position %u is null or not built", i); + idx_vec[i] = reinterpret_cast(indices[i]->addr); + q_vec[i] = cuvs::core::from_dlpack>(queries[i]); + n_vec[i] = cuvs::core::from_dlpack>(neighbors[i]); + d_vec[i] = cuvs::core::from_dlpack>(distances[i]); + } + + cuvs::neighbors::cagra::search_multi_segment( + *res_ptr, search_params, idx_vec, q_vec, n_vec, d_vec); + }); +} + extern "C" cuvsError_t cuvsCagraMerge(cuvsResources_t res, 
cuvsCagraIndexParams_t params, cuvsCagraIndex_t* indices, diff --git a/c/src/selection/select_k.cpp b/c/src/selection/select_k.cpp new file mode 100644 index 0000000000..f68416454a --- /dev/null +++ b/c/src/selection/select_k.cpp @@ -0,0 +1,42 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include "../core/exceptions.hpp" +#include +#include + +#include +#include + +extern "C" cuvsError_t cuvsSelectK(cuvsResources_t res, + DLManagedTensor* in_val, + DLManagedTensor* out_val, + DLManagedTensor* out_idx) +{ + return cuvs::core::translate_exceptions([=] { + auto* res_ptr = reinterpret_cast(res); + + int64_t n = in_val->dl_tensor.shape[1]; + int64_t k = out_val->dl_tensor.shape[1]; + + auto in_view = raft::make_device_matrix_view( + static_cast(in_val->dl_tensor.data), 1, n); + + auto out_val_view = raft::make_device_matrix_view( + static_cast(out_val->dl_tensor.data), 1, k); + + auto out_idx_view = raft::make_device_matrix_view( + static_cast(out_idx->dl_tensor.data), 1, k); + + cuvs::selection::select_k( + *res_ptr, + in_view, + std::nullopt, // implicit positions [0, n) as in_idx + out_val_view, + out_idx_view, + true); // select_min = true (smallest distance = nearest neighbor) + }); +} diff --git a/cpp/include/cuvs/neighbors/cagra.hpp b/cpp/include/cuvs/neighbors/cagra.hpp index a7e1249677..fd9b551a36 100644 --- a/cpp/include/cuvs/neighbors/cagra.hpp +++ b/cpp/include/cuvs/neighbors/cagra.hpp @@ -29,6 +29,7 @@ #include #include #include +#include namespace cuvs::neighbors::graph_build_params { using iterative_search_params = cuvs::neighbors::search_params; @@ -1723,6 +1724,87 @@ void search(raft::resources const& res, const cuvs::neighbors::filtering::base_filter& sample_filter = cuvs::neighbors::filtering::none_sample_filter{}); +/** + * @brief Search multiple CAGRA index segments concurrently in a single kernel launch. 
+ * + * Launches a single SINGLE_CTA kernel grid that covers all segments, with one CTA per segment. + * All per-segment results are written into the caller-supplied device buffers on the stream + * associated with @p res; the call returns when all segments have been submitted to the stream + * (not necessarily completed). Use @c cuvsStreamSync to wait for completion. + * + * Distance values are comparable across segments but are not postprocessed (no kScale correction). + * + * @param[in] res raft resources + * @param[in] params search parameters + * @param[in] indices one index per segment + * @param[in] queries per-segment query matrix [n_queries, dim] + * @param[out] neighbors per-segment result neighbors [n_queries, topk] + * @param[out] distances per-segment result distances [n_queries, topk] + */ +void search_multi_segment( + raft::resources const& res, + cuvs::neighbors::cagra::search_params const& params, + const std::vector*>& indices, + const std::vector>& queries, + const std::vector>& neighbors, + const std::vector>& distances); + +void search_multi_segment( + raft::resources const& res, + cuvs::neighbors::cagra::search_params const& params, + const std::vector*>& indices, + const std::vector>& queries, + const std::vector>& neighbors, + const std::vector>& distances); + +void search_multi_segment( + raft::resources const& res, + cuvs::neighbors::cagra::search_params const& params, + const std::vector*>& indices, + const std::vector>& queries, + const std::vector>& neighbors, + const std::vector>& distances); + +void search_multi_segment( + raft::resources const& res, + cuvs::neighbors::cagra::search_params const& params, + const std::vector*>& indices, + const std::vector>& queries, + const std::vector>& neighbors, + const std::vector>& distances); + +void search_multi_segment( + raft::resources const& res, + cuvs::neighbors::cagra::search_params const& params, + const std::vector*>& indices, + const std::vector>& queries, + const std::vector>& 
neighbors, + const std::vector>& distances); + +void search_multi_segment( + raft::resources const& res, + cuvs::neighbors::cagra::search_params const& params, + const std::vector*>& indices, + const std::vector>& queries, + const std::vector>& neighbors, + const std::vector>& distances); + +void search_multi_segment( + raft::resources const& res, + cuvs::neighbors::cagra::search_params const& params, + const std::vector*>& indices, + const std::vector>& queries, + const std::vector>& neighbors, + const std::vector>& distances); + +void search_multi_segment( + raft::resources const& res, + cuvs::neighbors::cagra::search_params const& params, + const std::vector*>& indices, + const std::vector>& queries, + const std::vector>& neighbors, + const std::vector>& distances); + /** * @} */ diff --git a/cpp/src/neighbors/cagra.cuh b/cpp/src/neighbors/cagra.cuh index 73c3794d39..8df172cff0 100644 --- a/cpp/src/neighbors/cagra.cuh +++ b/cpp/src/neighbors/cagra.cuh @@ -406,6 +406,19 @@ index merge(raft::resources const& handle, return cagra::detail::merge(handle, params, indices, row_filter); } +template +void search_multi_segment( + raft::resources const& res, + search_params const& params, + const std::vector*>& indices, + const std::vector>& queries, + const std::vector>& neighbors, + const std::vector>& distances) +{ + cagra::detail::search_multi_segment( + res, params, indices, queries, neighbors, distances); +} + /** @} */ // end group cagra } // namespace cuvs::neighbors::cagra diff --git a/cpp/src/neighbors/cagra_search_inst.cu.in b/cpp/src/neighbors/cagra_search_inst.cu.in index dfef630798..be5c4a906d 100644 --- a/cpp/src/neighbors/cagra_search_inst.cu.in +++ b/cpp/src/neighbors/cagra_search_inst.cu.in @@ -32,4 +32,22 @@ CUVS_INST_CAGRA_SEARCH(data_t, uint32_t, int64_t); #undef CUVS_INST_CAGRA_SEARCH +#define CUVS_INST_CAGRA_SEARCH_MULTI_SEGMENT(T, IdxT, OutputIdxT) \ + void search_multi_segment( \ + raft::resources const& handle, \ + 
cuvs::neighbors::cagra::search_params const& params, \ + const std::vector*>& indices, \ + const std::vector>& queries, \ + const std::vector>& neighbors, \ + const std::vector>& distances) \ + { \ + cuvs::neighbors::cagra::search_multi_segment( \ + handle, params, indices, queries, neighbors, distances); \ + } + +CUVS_INST_CAGRA_SEARCH_MULTI_SEGMENT(data_t, uint32_t, uint32_t); +CUVS_INST_CAGRA_SEARCH_MULTI_SEGMENT(data_t, uint32_t, int64_t); + +#undef CUVS_INST_CAGRA_SEARCH_MULTI_SEGMENT + } // namespace cuvs::neighbors::cagra diff --git a/cpp/src/neighbors/detail/cagra/cagra_search.cuh b/cpp/src/neighbors/detail/cagra/cagra_search.cuh index f1650980e0..4b3480ae50 100644 --- a/cpp/src/neighbors/detail/cagra/cagra_search.cuh +++ b/cpp/src/neighbors/detail/cagra/cagra_search.cuh @@ -9,6 +9,7 @@ #include "factory.cuh" #include "sample_filter_utils.cuh" #include "search_plan.cuh" +#include "search_single_cta.cuh" #include #include @@ -251,4 +252,169 @@ void search_main(raft::resources const& res, } /** @} */ // end group cagra +/** + * @brief Search all segments concurrently for a single query using one kernel launch. + * + * Each segment's CTA runs independently (blockIdx.z = segment_id, blockIdx.y = query_id). + * All segments must use float32 data with the same search parameters. 
+ * + * @param indices per-segment indices (strided datasets only for now) + * @param queries per-segment query views — same vector repeated for each segment + * @param neighbors per-segment output neighbor views — each [num_queries, topk] + * @param distances per-segment output distance views — each [num_queries, topk] + */ +template +void search_multi_segment( + raft::resources const& res, + search_params params, + const std::vector*>& indices, + const std::vector>& queries, + const std::vector>& neighbors, + const std::vector>& distances) +{ + static_assert(std::is_same_v, "Only uint32_t graph index type is supported"); + static_assert(std::is_same_v, "Only float distances are supported"); + + const uint32_t num_segments = static_cast(indices.size()); + RAFT_EXPECTS(num_segments > 0, "At least one segment is required"); + RAFT_EXPECTS(queries.size() == num_segments && neighbors.size() == num_segments && + distances.size() == num_segments, + "All input vectors must have the same size"); + + const int64_t dim = queries[0].extent(1); + const uint32_t topk = static_cast(neighbors[0].extent(1)); + const uint32_t n_queries = static_cast(queries[0].extent(0)); + + // Find the max graph_degree across all segments (needed for the shared kernel plan). + int64_t max_graph_degree = 0; + int64_t max_dataset_size = 0; + for (uint32_t i = 0; i < num_segments; i++) { + RAFT_EXPECTS(!indices[i]->dataset_fd().has_value(), + "Disk-based datasets are not supported for multi-segment search"); + max_graph_degree = std::max(max_graph_degree, indices[i]->graph().extent(1)); + max_dataset_size = std::max(max_dataset_size, indices[i]->data().n_rows()); + } + + if (params.max_queries == 0) { + cudaDeviceProp deviceProp = raft::resource::get_device_properties(res); + params.max_queries = + std::min(static_cast(n_queries), deviceProp.maxGridSize[1]); + } + + // Multi-segment uses a regular (non-persistent) single-CTA kernel launch. 
+ // MULTI_CTA requires a different plan type (multi_cta_search::search) and is not supported here. + // AUTO could resolve to MULTI_CTA for large itopk_size, so force SINGLE_CTA unconditionally. + params.persistent = false; + params.algo = search_algo::SINGLE_CTA; + + // Build a single search plan sized for the maximum graph_degree across all segments. + // For the first segment's descriptor type (strided float): use it to init the plan. + using graph_idx_type = uint32_t; + auto* strided_dset0 = dynamic_cast*>(&indices[0]->data()); + RAFT_EXPECTS(strided_dset0 != nullptr, + "Multi-segment search only supports strided (non-compressed) datasets"); + + RAFT_EXPECTS(indices[0]->metric() != cuvs::distance::DistanceType::CosineExpanded || + indices[0]->dataset_norms().has_value(), + "Dataset norms must be provided for CosineExpanded metric"); + const float* dataset_norms_ptr0 = nullptr; + if (indices[0]->metric() == cuvs::distance::DistanceType::CosineExpanded) { + dataset_norms_ptr0 = indices[0]->dataset_norms().value().data_handle(); + } + // Use the first segment's descriptor to construct the plan (smem layout is type-dependent only). + auto plan_desc = dataset_descriptor_init_with_cache( + res, params, *strided_dset0, indices[0]->metric(), dataset_norms_ptr0); + + single_cta_search::search + plan(res, params, plan_desc, dim, max_dataset_size, max_graph_degree, topk); + + // Build per-segment descriptors and result pointers on the host. + // The device copy is allocated below. + using seg_desc_t = single_cta_search::multi_segment_desc_t; + std::vector host_seg_descs(num_segments); + + cudaStream_t stream = raft::resource::get_cuda_stream(res); + + // Collect per-segment dataset descriptors (may trigger lazy device init on `stream`). 
+ std::vector> seg_dataset_descs; + seg_dataset_descs.reserve(num_segments); + + for (uint32_t i = 0; i < num_segments; i++) { + auto* strided_dset = dynamic_cast*>(&indices[i]->data()); + RAFT_EXPECTS(strided_dset != nullptr, + "All segments must have strided (non-compressed) datasets"); + const float* norms_ptr = nullptr; + if (indices[i]->metric() == cuvs::distance::DistanceType::CosineExpanded) { + RAFT_EXPECTS(indices[i]->dataset_norms().has_value(), + "Dataset norms required for CosineExpanded metric (segment %u)", i); + norms_ptr = indices[i]->dataset_norms().value().data_handle(); + } + seg_dataset_descs.push_back(dataset_descriptor_init_with_cache( + res, params, *strided_dset, indices[i]->metric(), norms_ptr)); + + // Call dev_ptr to trigger lazy device-side descriptor upload (enqueued on stream). + host_seg_descs[i].dataset_desc = seg_dataset_descs.back().dev_ptr(stream); + host_seg_descs[i].graph = indices[i]->graph().data_handle(); + host_seg_descs[i].graph_degree = static_cast(indices[i]->graph().extent(1)); + host_seg_descs[i].queries_ptr = queries[i].data_handle(); + + // Tag the result_indices_ptr with the OutputIdxT size (same convention as select_and_run). + constexpr uintptr_t kTag = raft::Pow2::Log2; + host_seg_descs[i].result_indices_ptr = + reinterpret_cast(neighbors[i].data_handle()) | kTag; + host_seg_descs[i].result_distances_ptr = distances[i].data_handle(); + } + + // Upload segment descriptors via workspace pool (no cudaMallocAsync/cudaFreeAsync after warmup). + lightweight_uvector dev_seg_descs_buf(res); + dev_seg_descs_buf.resize(num_segments, stream); + RAFT_CUDA_TRY(cudaMemcpyAsync(dev_seg_descs_buf.data(), + host_seg_descs.data(), + num_segments * sizeof(seg_desc_t), + cudaMemcpyHostToDevice, + stream)); + + // Launch all-segment kernel; stream ordering ensures descriptor upload and per-segment + // dataset_desc device-init complete before the search kernel executes. 
+ plan.run_multi_segment(res, dev_seg_descs_buf.data(), num_segments, n_queries, topk); + // dev_seg_descs_buf destructor returns memory to workspace pool (stream-ordered). + + // Post-process distances (scale + metric transform) for each segment. + constexpr float kScale = cuvs::spatial::knn::detail::utils::config::kDivisor / + cuvs::spatial::knn::detail::utils::config::kDivisor; + for (uint32_t i = 0; i < num_segments; i++) { + float* dist_out = distances[i].data_handle(); + const DistanceT* dist_in = distances[i].data_handle(); + if (indices[i]->metric() == cuvs::distance::DistanceType::CosineExpanded) { + auto query_norms = raft::make_device_vector(res, n_queries); + auto scaled_sq_op = + raft::compose_op(raft::sq_op{}, + raft::div_const_op{DistanceT(kScale)}, + raft::cast_op()); + raft::linalg::reduce( + res, + raft::make_device_matrix_view( + queries[i].data_handle(), n_queries, dim), + query_norms.view(), + (DistanceT)0, + false, + scaled_sq_op, + raft::add_op(), + raft::sqrt_op{}); + raft::linalg::matrix_vector_op( + res, + raft::make_const_mdspan(distances[i]), + raft::make_const_mdspan(query_norms.view()), + distances[i], + raft::compose_op(raft::add_const_op{DistanceT(1)}, raft::div_checkzero_op{})); + } else { + cuvs::neighbors::ivf::detail::postprocess_distances( + res, dist_out, dist_in, indices[i]->metric(), n_queries, topk, kScale, true); + } + } +} + } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh index 02bf1ff697..0592dc4ed5 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh @@ -87,6 +87,12 @@ struct search uint32_t num_itopk_candidates; + /** Number of elements in a hashmap covering @p n_queries queries across @p n_segments segments. 
*/ + static size_t hashmap_element_count(size_t n_segments, size_t n_queries, size_t h_bitlen) + { + return n_segments * n_queries * hashmap::get_size(h_bitlen); + } + search(raft::resources const& res, search_params params, const dataset_descriptor_host& dataset_desc, @@ -197,12 +203,60 @@ struct search RAFT_LOG_DEBUG("# smem_size: %u", smem_size); hashmap_size = 0; if (small_hash_bitlen == 0 && !this->persistent) { - hashmap_size = max_queries * hashmap::get_size(hash_bitlen); + hashmap_size = hashmap_element_count(1, max_queries, hash_bitlen); hashmap.resize(hashmap_size, raft::resource::get_cuda_stream(res)); } RAFT_LOG_DEBUG("# hashmap_size: %lu", hashmap_size); } + /** + * @brief Search all segments concurrently in a single kernel launch. + * + * @param res RAFT resources (stream is extracted from here) + * @param segment_descs device pointer to [num_segments] descriptors + * @param num_segments number of segments (gridDim.z) + * @param num_queries queries per segment (gridDim.y) + * @param topk neighbors to return per (query, segment) + */ + void run_multi_segment( + raft::resources const& res, + const multi_segment_desc_t* segment_descs, + uint32_t num_segments, + uint32_t num_queries, + uint32_t topk) + { + cudaStream_t stream = raft::resource::get_cuda_stream(res); + + // Allocate global hashmap when small-hash is disabled via the workspace pool + // (no cudaMallocAsync/cudaFreeAsync after pool warmup). + // Layout: [num_segments][num_queries][hash_size]. 
+ lightweight_uvector ms_hashmap_buf(res); + INDEX_T* ms_hashmap_ptr = nullptr; + if (small_hash_bitlen == 0) { + const size_t ms_hashmap_elems = hashmap_element_count(num_segments, num_queries, hash_bitlen); + ms_hashmap_buf.resize(ms_hashmap_elems, stream); + ms_hashmap_ptr = ms_hashmap_buf.data(); + } + + select_and_run_multi_segment( + segment_descs, + num_segments, + num_queries, + *this, + topk, + num_itopk_candidates, + static_cast(thread_block_size), + smem_size, + hash_bitlen, + ms_hashmap_ptr, + small_hash_bitlen, + small_hash_reset_interval, + cuvs::neighbors::filtering::none_sample_filter{}, + stream); + // ms_hashmap_buf destructor returns memory to workspace pool (stream-ordered). + } + void operator()( raft::resources const& res, raft::device_matrix_view graph, diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cu.in b/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cu.in index 85342e7093..c2d30fa6f5 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cu.in +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cu.in @@ -20,5 +20,9 @@ instantiate_kernel_selection(data_t, float, cuvs::neighbors::filtering::none_sample_filter); instantiate_kernel_selection(data_t, uint32_t, float, bitset_filter_t); +instantiate_kernel_selection_ms(data_t, + uint32_t, + float, + cuvs::neighbors::filtering::none_sample_filter); } // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh index 11b468cfca..18314b5c59 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_inst.cuh @@ -34,4 +34,21 @@ namespace cuvs::neighbors::cagra::detail::single_cta_search { SampleFilterT sample_filter, \ cudaStream_t stream); +#define instantiate_kernel_selection_ms(DataT, IndexT, DistanceT, SampleFilterT) \ + template void 
select_and_run_multi_segment( \ + const multi_segment_desc_t* segment_descs, \ + uint32_t num_segments, \ + uint32_t num_queries, \ + const search_params& ps, \ + uint32_t topk, \ + uint32_t num_itopk_candidates, \ + uint32_t block_size, \ + uint32_t smem_size, \ + int64_t hash_bitlen, \ + IndexT* hashmap_ptr, \ + size_t small_hash_bitlen, \ + size_t small_hash_reset_interval, \ + SampleFilterT sample_filter, \ + cudaStream_t stream); + } // namespace cuvs::neighbors::cagra::detail::single_cta_search diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh index 48553611bf..a0b66acc4b 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh @@ -1165,6 +1165,146 @@ RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel( graph_size); } +/** + * @brief Multi-segment CAGRA search kernel. + * + * Grid: (1, num_queries, num_segments). + * Each CTA handles one (query, segment) pair independently. + * The global hashmap (if used) must be laid out as + * [num_segments][num_queries][hashmap::get_size(hash_bitlen)]. 
+ */ +template +RAFT_KERNEL __launch_bounds__(1024, 1) search_kernel_ms( + const multi_segment_desc_t* segments, + const std::uint32_t top_k, + const SourceIndexT* source_indices_ptr, + const unsigned num_distilation, + const uint64_t rand_xor_mask, + const uint32_t num_seeds, + typename DATASET_DESCRIPTOR_T::INDEX_T* visited_hashmap_ptr, + const std::uint32_t max_candidates, + const std::uint32_t max_itopk, + const std::uint32_t internal_topk, + const std::uint32_t search_width, + const std::uint32_t min_iteration, + const std::uint32_t max_iteration, + std::uint32_t* const num_executed_iterations, + const std::uint32_t hash_bitlen, + const std::uint32_t small_hash_bitlen, + const std::uint32_t small_hash_reset_interval, + SAMPLE_FILTER_T sample_filter) +{ + using INDEX_T = typename DATASET_DESCRIPTOR_T::INDEX_T; + + const uint32_t query_id = blockIdx.y; + const uint32_t seg_id = blockIdx.z; + const auto& seg = segments[seg_id]; + + // Offset the global hashmap to the base of this segment's block. + // search_core will then add blockIdx.y * hash_size for the per-query offset, giving the correct + // layout: visited_hashmap_ptr[(seg_id * gridDim.y + query_id) * hash_size]. + INDEX_T* seg_hashmap_ptr = + (visited_hashmap_ptr != nullptr) + ? 
visited_hashmap_ptr + + seg_id * static_cast(gridDim.y) * hashmap::get_size(hash_bitlen) + : nullptr; + + search_core(seg.result_indices_ptr, + seg.result_distances_ptr, + top_k, + seg.dataset_desc, + seg.queries_ptr, + seg.graph, + seg.graph_degree, + source_indices_ptr, + num_distilation, + rand_xor_mask, + nullptr, // seed_ptr: not used in multi-segment + 0, // num_seeds + seg_hashmap_ptr, + max_candidates, + max_itopk, + internal_topk, + search_width, + min_iteration, + max_iteration, + num_executed_iterations, + hash_bitlen, + small_hash_bitlen, + small_hash_reset_interval, + query_id, + sample_filter); +} + +template +auto dispatch_kernel_ms = []() { + static_assert(TOPK_BY_BITONIC_SORT || !BITONIC_SORT_AND_MERGE_MULTI_WARPS); + return search_kernel_ms; +}(); + +/** + * @brief Encodes the (TOPK_BY_BITONIC_SORT, BITONIC_SORT_AND_MERGE_MULTI_WARPS) template + * booleans as a runtime value, selected from search parameters. + */ +enum class TopkVariant { + BITONIC, ///< bitonic sort, no multi-warp merge (num_itopk_candidates ≤ 256, itopk_size ≤ 256) + BITONIC_MERGE_MULTI, ///< bitonic sort, multi-warp merge (num_itopk_candidates ≤ 256, itopk_size > 256) + RADIX, ///< radix-based topk (num_itopk_candidates > 256) +}; + +/** + * @brief Selects the topk algorithm variant from runtime search parameters. 
+ */ +inline TopkVariant select_topk_variant(unsigned itopk_size, + unsigned num_itopk_candidates, + unsigned block_size) +{ + assert(itopk_size <= 512); + if (num_itopk_candidates <= 256) { + if (itopk_size <= 256) { return TopkVariant::BITONIC; } + assert(block_size >= 64); + return TopkVariant::BITONIC_MERGE_MULTI; + } + return TopkVariant::RADIX; +} + +template +struct search_kernel_config_ms { + using kernel_t = decltype(dispatch_kernel_ms); + + static auto choose_itopk_and_mx_candidates(unsigned itopk_size, + unsigned num_itopk_candidates, + unsigned block_size) -> kernel_t + { + switch (select_topk_variant(itopk_size, num_itopk_candidates, block_size)) { + case TopkVariant::BITONIC: + return dispatch_kernel_ms; + case TopkVariant::BITONIC_MERGE_MULTI: + return dispatch_kernel_ms; + default: + return dispatch_kernel_ms; + } + } +}; + // To make sure we avoid false sharing on both CPU and GPU, we enforce cache line size to the // maximum of the two. // This makes sync atomic significantly faster. @@ -2222,6 +2362,48 @@ auto get_runner(Args... args) -> std::shared_ptr return runner; } +/** + * @brief Computes the max_candidates and max_itopk constants passed to the search kernel. + * + * Both values are rounded up to the next power-of-two bucket supported by the kernel template + * instantiations. They are the same for single-segment and multi-segment launches, so this helper + * is shared by select_and_run and select_and_run_multi_segment. 
+ */ +struct kernel_dispatch_params { + uint32_t max_candidates; + uint32_t max_itopk; + + static kernel_dispatch_params compute(const search_params& ps, uint32_t num_itopk_candidates) + { + kernel_dispatch_params p{}; + if (num_itopk_candidates <= 64) { + p.max_candidates = 64; + } else if (num_itopk_candidates <= 128) { + p.max_candidates = 128; + } else if (num_itopk_candidates <= 256) { + p.max_candidates = 256; + } else { + p.max_candidates = 32; // irrelevant, radix-based topk is used + } + + assert(ps.itopk_size <= 512); + if (num_itopk_candidates <= 256) { // bitonic sort + if (ps.itopk_size <= 64) { + p.max_itopk = 64; + } else if (ps.itopk_size <= 128) { + p.max_itopk = 128; + } else if (ps.itopk_size <= 256) { + p.max_itopk = 256; + } else { + p.max_itopk = 512; + } + } else { // radix sort + p.max_itopk = (ps.itopk_size <= 256) ? 256 : 512; + } + return p; + } +}; + template data_handle() : nullptr; - uint32_t max_candidates{}; - if (num_itopk_candidates <= 64) { - max_candidates = 64; - } else if (num_itopk_candidates <= 128) { - max_candidates = 128; - } else if (num_itopk_candidates <= 256) { - max_candidates = 256; - } else { - max_candidates = - 32; // irrelevant, radix based topk is used (see choose_itopk_and_max_candidates) - } - - uint32_t max_itopk{}; - assert(ps.itopk_size <= 512); - if (num_itopk_candidates <= 256) { // bitonic sort - if (ps.itopk_size <= 64) { - max_itopk = 64; - } else if (ps.itopk_size <= 128) { - max_itopk = 128; - } else if (ps.itopk_size <= 256) { - max_itopk = 256; - } else { - max_itopk = 512; - } - } else { // radix sort - if (ps.itopk_size <= 256) { - max_itopk = 256; - } else { - max_itopk = 512; - } - } + auto [max_candidates, max_itopk] = kernel_dispatch_params::compute(ps, num_itopk_candidates); if (ps.persistent) { using runner_type = persistent_runner_t; @@ -2351,5 +2503,87 @@ control is returned in this thread (in persistent_runner_t constructor), so we'r RAFT_CUDA_TRY(cudaPeekAtLastError()); } } + +/** + * 
@brief Launch the multi-segment CAGRA search kernel. + * + * Searches all N segments concurrently in a single kernel launch. Each CTA (indexed by + * blockIdx.y = query_id, blockIdx.z = seg_id) independently searches one segment for one query. + * + * @param segment_descs device pointer to array of num_segments descriptors + * @param num_segments number of segments (= gridDim.z) + * @param num_queries number of queries (= gridDim.y) + * @param ps search parameters (shared across all segments) + * @param topk number of neighbors to return per segment + * @param num_itopk_candidates search_width * max_graph_degree + * @param block_size thread-block size + * @param smem_size shared memory per CTA (computed for max graph_degree) + * @param hash_bitlen global hashmap bit-length + * @param hashmap_ptr device buffer sized [num_segments * num_queries * get_size(hash_bitlen)] + * @param small_hash_bitlen small-hash bit-length (0 = disabled) + * @param small_hash_reset_interval reset interval for small hash + * @param sample_filter sample filter + * @param stream CUDA stream + */ +template +void select_and_run_multi_segment( + const multi_segment_desc_t* segment_descs, + uint32_t num_segments, + uint32_t num_queries, + const search_params& ps, + uint32_t topk, + uint32_t num_itopk_candidates, + uint32_t block_size, + uint32_t smem_size, + int64_t hash_bitlen, + IndexT* hashmap_ptr, + size_t small_hash_bitlen, + size_t small_hash_reset_interval, + SampleFilterT sample_filter, + cudaStream_t stream) +{ + using descriptor_base_type = dataset_descriptor_base_t; + + auto [max_candidates, max_itopk] = kernel_dispatch_params::compute(ps, num_itopk_candidates); + + auto kernel = + search_kernel_config_ms:: + choose_itopk_and_mx_candidates(ps.itopk_size, num_itopk_candidates, block_size); + + dim3 thread_dims(block_size, 1, 1); + dim3 block_dims(1, num_queries, num_segments); + RAFT_LOG_DEBUG("Launching ms kernel: %u threads, %u queries, %u segments, %u smem", + block_size, + 
num_queries, + num_segments, + smem_size); + auto const& kernel_launcher = [&](auto const& kernel) -> void { + kernel<<>>(segment_descs, + topk, + nullptr, // source_indices_ptr + ps.num_random_samplings, + ps.rand_xor_mask, + 0, // num_seeds + hashmap_ptr, + max_candidates, + max_itopk, + ps.itopk_size, + ps.search_width, + ps.min_iterations, + ps.max_iterations, + nullptr, // num_executed_iterations + hash_bitlen, + small_hash_bitlen, + small_hash_reset_interval, + sample_filter); + }; + cuvs::neighbors::detail::safely_launch_kernel_with_smem_size(kernel, smem_size, kernel_launcher); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + } // namespace single_cta_search } // namespace cuvs::neighbors::cagra::detail diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh index ba308db98b..2af57d421f 100644 --- a/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh +++ b/cpp/src/neighbors/detail/cagra/search_single_cta_kernel.cuh @@ -10,6 +10,23 @@ namespace cuvs::neighbors::cagra::detail::single_cta_search { +/** + * @brief Per-segment descriptor for the multi-segment CAGRA search kernel. + * + * One instance per Lucene segment; the kernel reads this array from device memory using + * blockIdx.z as the segment index. 
+ */ +template +struct alignas(16) multi_segment_desc_t { + const dataset_descriptor_base_t* dataset_desc; + const DataT* queries_ptr; // [num_queries, dim] for this segment + const IndexT* graph; // [dataset_size, graph_degree] + uint32_t graph_degree; + uint32_t _pad; + uintptr_t result_indices_ptr; // tagged pointer: [num_queries, top_k] + DistanceT* result_distances_ptr; // [num_queries, top_k] +}; + template +void select_and_run_multi_segment( + const multi_segment_desc_t* segment_descs, + uint32_t num_segments, + uint32_t num_queries, + const search_params& ps, + uint32_t topk, + uint32_t num_itopk_candidates, + uint32_t block_size, + uint32_t smem_size, + int64_t hash_bitlen, + IndexT* hashmap_ptr, + size_t small_hash_bitlen, + size_t small_hash_reset_interval, + SampleFilterT sample_filter, + cudaStream_t stream); + } diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraIndex.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraIndex.java index c87f024124..2e48928636 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraIndex.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraIndex.java @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs; diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraSearchParams.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraSearchParams.java index 76e1f10bd9..a9ea94a440 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraSearchParams.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CagraSearchParams.java @@ -25,6 +25,9 @@ public class CagraSearchParams { private long randXORMask; private SearchAlgo searchAlgo; private HashMapMode hashMapMode; + private boolean persistent; + private float persistentLifetime; + private float persistentDeviceUsage; /** * Enum to denote algorithm used to search CAGRA Index. @@ -120,7 +123,10 @@ private CagraSearchParams( int hashmapMinBitlen, float hashmapMaxFillRate, int numRandomSamplings, - long randXORMask) { + long randXORMask, + boolean persistent, + float persistentLifetime, + float persistentDeviceUsage) { this.maxQueries = maxQueries; this.iTopKSize = iTopKSize; this.maxIterations = maxIterations; @@ -134,6 +140,9 @@ private CagraSearchParams( this.hashMapMaxFillRate = hashmapMaxFillRate; this.numRandomSamplings = numRandomSamplings; this.randXORMask = randXORMask; + this.persistent = persistent; + this.persistentLifetime = persistentLifetime; + this.persistentDeviceUsage = persistentDeviceUsage; } /** @@ -254,6 +263,33 @@ public HashMapMode getHashMapMode() { return hashMapMode; } + /** + * Gets whether the persistent kernel is enabled. + * + * @return true if the persistent kernel is enabled + */ + public boolean isPersistent() { + return persistent; + } + + /** + * Gets the persistent kernel lifetime in seconds. + * + * @return the lifetime in seconds + */ + public float getPersistentLifetime() { + return persistentLifetime; + } + + /** + * Gets the fraction of maximum grid size used by the persistent kernel. 
+ * + * @return the device usage fraction (0.0, 1.0] + */ + public float getPersistentDeviceUsage() { + return persistentDeviceUsage; + } + @Override public String toString() { return "CagraSearchParams [maxQueries=" @@ -301,8 +337,11 @@ public static class Builder { private int numRandomSamplings = 1; private float hashMapMaxFillRate = 0.5f; private long randXORMask = 0x128394; - private SearchAlgo searchAlgo; - private HashMapMode hashMapMode; + private SearchAlgo searchAlgo = SearchAlgo.AUTO; + private HashMapMode hashMapMode = HashMapMode.AUTO_HASH; + private boolean persistent = false; + private float persistentLifetime = 2.0f; + private float persistentDeviceUsage = 1.0f; /** * Default constructor. @@ -460,6 +499,43 @@ public Builder withRandXorMask(long randXORMask) { return this; } + /** + * Enables or disables the persistent kernel. + * + *

When enabled, the CAGRA SINGLE_CTA kernel stays resident on the GPU and serves search + * jobs via system-scope atomics. The kernel is shared across all indexes. + * + * @param persistent true to enable the persistent kernel + * @return an instance of this Builder + */ + public Builder withPersistent(boolean persistent) { + this.persistent = persistent; + return this; + } + + /** + * Sets the time in seconds before an idle persistent kernel exits. + * + * @param persistentLifetime lifetime in seconds (default 2.0) + * @return an instance of this Builder + */ + public Builder withPersistentLifetime(float persistentLifetime) { + this.persistentLifetime = persistentLifetime; + return this; + } + + /** + * Sets the fraction of maximum grid size used by the persistent kernel. + * Must be greater than 0.0 and not greater than 1.0. + * + * @param persistentDeviceUsage device usage fraction (default 1.0) + * @return an instance of this Builder + */ + public Builder withPersistentDeviceUsage(float persistentDeviceUsage) { + this.persistentDeviceUsage = persistentDeviceUsage; + return this; + } + /** * Builds an instance of {@link CagraSearchParams} with passed search * parameters. @@ -480,7 +556,10 @@ public CagraSearchParams build() { hashMapMinBitlen, hashMapMaxFillRate, numRandomSamplings, - randXORMask); + randXORMask, + persistent, + persistentLifetime, + persistentDeviceUsage); } } } diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSAceParams.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSAceParams.java index b70547b333..d9e6b2598d 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSAceParams.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSAceParams.java @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs; @@ -75,8 +75,13 @@ public class CuVSAceParams { */ private final double maxGpuMemoryGb; - private CuVSAceParams(long npartitions, long efConstruction, String buildDir, boolean useDisk, - double maxHostMemoryGb, double maxGpuMemoryGb) { + private CuVSAceParams( + long npartitions, + long efConstruction, + String buildDir, + boolean useDisk, + double maxHostMemoryGb, + double maxGpuMemoryGb) { this.npartitions = npartitions; this.efConstruction = efConstruction; this.buildDir = buildDir; @@ -259,8 +264,8 @@ public Builder withMaxGpuMemoryGb(double maxGpuMemoryGb) { * @return an instance of {@link CuVSAceParams} */ public CuVSAceParams build() { - return new CuVSAceParams(npartitions, efConstruction, buildDir, useDisk, - maxHostMemoryGb, maxGpuMemoryGb); + return new CuVSAceParams( + npartitions, efConstruction, buildDir, useDisk, maxHostMemoryGb, maxGpuMemoryGb); } } } diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSMatrix.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSMatrix.java index e0e39a4b4b..a19254ee3b 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSMatrix.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSMatrix.java @@ -18,6 +18,7 @@ public interface CuVSMatrix extends AutoCloseable { enum DataType { FLOAT(4), + HALF(2), INT(4), UINT(4), BYTE(1); @@ -94,6 +95,13 @@ interface Builder { */ void addVector(int[] vector); + /** + * Adds a single vector to the matrix. Each element is a raw float16 bit pattern stored in a short. 
+ * + * @param vector A short array of as many elements as the dimensions + */ + void addVector(short[] vector); + T build(); } diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSResources.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSResources.java index b105580328..01d500b948 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSResources.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/CuVSResources.java @@ -57,6 +57,24 @@ interface ScopedAccess extends AutoCloseable { */ Path tempDirectory(); + /** + * Configure the temporary workspace on this resources object as an uncapped pool backed by the + * current device memory resource. After the initial reservation is allocated on first use, + * subsequent calls to {@code cuvsRMMAlloc} / {@code cuvsRMMFree} on this handle hit the pool + * cache rather than calling {@code cudaMallocAsync} / {@code cudaFreeAsync}, reducing CUDA + * context lock contention under concurrent query threads. The pool grows without shrinking: + * freed allocations are returned to the pool rather than to the device, so the pool's + * high-water mark only increases until the resources object is closed. + * + *

The pool is per-resources-handle (i.e. per query thread when resources are thread-local), + * so there is no cross-thread pool mutex contention. Call this once after creating the resources + * object; calling it again replaces the pool. + * + * @param initialSizeBytes initial pool reservation in bytes; size {@code initialSizeBytes} to + * cover the steady-state working set to avoid growth after warmup + */ + void setWorkspacePool(long initialSizeBytes); + /** * Creates a new resources. * Equivalent to diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswAceParams.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswAceParams.java index 325f424fae..215c22838f 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswAceParams.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswAceParams.java @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs; @@ -21,8 +21,12 @@ public class HnswAceParams { private double maxHostMemoryGb; private double maxGpuMemoryGb; - private HnswAceParams(long npartitions, String buildDir, boolean useDisk, - double maxHostMemoryGb, double maxGpuMemoryGb) { + private HnswAceParams( + long npartitions, + String buildDir, + boolean useDisk, + double maxHostMemoryGb, + double maxGpuMemoryGb) { this.npartitions = npartitions; this.buildDir = buildDir; this.useDisk = useDisk; @@ -188,8 +192,7 @@ public Builder withMaxGpuMemoryGb(double maxGpuMemoryGb) { * @return an instance of {@link HnswAceParams} */ public HnswAceParams build() { - return new HnswAceParams(npartitions, buildDir, useDisk, - maxHostMemoryGb, maxGpuMemoryGb); + return new HnswAceParams(npartitions, buildDir, useDisk, maxHostMemoryGb, maxGpuMemoryGb); } } } diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndex.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndex.java 
index 3eef491b62..84979cfe0c 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndex.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndex.java @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs; diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndexParams.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndexParams.java index 070cbedae1..d68e01b58b 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndexParams.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/HnswIndexParams.java @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs; @@ -283,13 +283,7 @@ public Builder withAceParams(HnswAceParams aceParams) { */ public HnswIndexParams build() { return new HnswIndexParams( - hierarchy, - efConstruction, - numThreads, - vectorDimension, - m, - metric, - aceParams); + hierarchy, efConstruction, numThreads, vectorDimension, m, metric, aceParams); } } } diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/MultiSegmentSearchResults.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/MultiSegmentSearchResults.java new file mode 100644 index 0000000000..fd43d18739 --- /dev/null +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/MultiSegmentSearchResults.java @@ -0,0 +1,55 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +package com.nvidia.cuvs; + +/** + * Holds the decoded results of a multi-segment GPU search. + * + *

Each entry {@code i} in [0, {@link #count}) identifies: + *

+ * <ul>
+ *   <li>which input segment the result came from ({@link #getSegmentIndex(int)})</li>
+ *   <li>the local vector ordinal within that segment ({@link #getOrdinal(int)})</li>
+ *   <li>the raw CAGRA distance ({@link #getDistance(int)})</li>
+ * </ul>
+ *

The caller is responsible for mapping ordinals to Lucene doc IDs using the + * segment-specific {@code ordToDoc} function and adding {@code docBase}. + * + * @since 25.10 + */ +public class MultiSegmentSearchResults { + + private final int count; + private final int[] segmentIndices; + private final int[] ordinals; + private final float[] distances; + + MultiSegmentSearchResults(int count, int[] segmentIndices, int[] ordinals, float[] distances) { + this.count = count; + this.segmentIndices = segmentIndices; + this.ordinals = ordinals; + this.distances = distances; + } + + /** Number of valid results (may be less than k if fewer candidates exist). */ + public int count() { + return count; + } + + /** Index into the original segment list for result {@code i}. */ + public int getSegmentIndex(int i) { + return segmentIndices[i]; + } + + /** Local vector ordinal within the segment for result {@code i}. */ + public int getOrdinal(int i) { + return ordinals[i]; + } + + /** Raw CAGRA distance for result {@code i} (before score normalization). 
*/ + public float getDistance(int i) { + return distances[i]; + } +} diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/SynchronizedCuVSResources.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/SynchronizedCuVSResources.java index 64a72ec32a..aa74893c6f 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/SynchronizedCuVSResources.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/SynchronizedCuVSResources.java @@ -40,6 +40,11 @@ public void close() { inner.close(); } + @Override + public void setWorkspacePool(long sizeBytes) { + inner.setWorkspacePool(sizeBytes); + } + @Override public Path tempDirectory() { return inner.tempDirectory(); diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/CuVSProvider.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/CuVSProvider.java index c39578755c..558d2e73f7 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/CuVSProvider.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/CuVSProvider.java @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs.spi; @@ -189,6 +189,16 @@ default CagraIndex mergeCagraIndexes(CagraIndex[] indexes, CagraIndexParams merg */ void enableRMMManagedPooledMemory(int initialPoolSizePercent, int maxPoolSizePercent); + /** + * Switch RMM allocations to use stream-ordered asynchronous allocation + * ({@code cudaMallocAsync} / {@code cudaFreeAsync}). Unlike the pool resource, this resource + * returns memory to the stream without blocking the CPU, eliminating device-wide synchronization + * on deallocation. This is especially beneficial when multiple CAGRA searches run concurrently + * on separate CUDA streams, because internal workspace allocations no longer serialize kernel + * launches. This operation has a global effect and will affect all resources on the current device. 
+ */ + void enableRMMAsyncMemory(); + /** Disables pooled memory on the current device, reverting back to the default setting. */ void resetRMMPooledMemory(); diff --git a/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/UnsupportedProvider.java b/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/UnsupportedProvider.java index 7cbeee4e75..6701afa47b 100644 --- a/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/UnsupportedProvider.java +++ b/java/cuvs-java/src/main/java/com/nvidia/cuvs/spi/UnsupportedProvider.java @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs.spi; @@ -47,8 +47,8 @@ public HnswIndex hnswIndexFromCagra(HnswIndexParams hnswParams, CagraIndex cagra } @Override - public HnswIndex hnswIndexBuild(CuVSResources resources, HnswIndexParams hnswParams, CuVSMatrix dataset) - throws Throwable { + public HnswIndex hnswIndexBuild( + CuVSResources resources, HnswIndexParams hnswParams, CuVSMatrix dataset) throws Throwable { throw new UnsupportedOperationException(reasons); } @@ -106,6 +106,11 @@ public Level getLogLevel() { throw new UnsupportedOperationException(reasons); } + @Override + public void enableRMMAsyncMemory() { + throw new UnsupportedOperationException(reasons); + } + @Override public void enableRMMPooledMemory(int initialPoolSizePercent, int maxPoolSizePercent) { throw new UnsupportedOperationException(reasons); diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/MultiSegmentCagraSearch.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/MultiSegmentCagraSearch.java new file mode 100644 index 0000000000..ad2d5a449a --- /dev/null +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/MultiSegmentCagraSearch.java @@ -0,0 +1,220 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +package com.nvidia.cuvs; + +import static com.nvidia.cuvs.internal.common.CloseableRMMAllocation.allocateRMMSegment; +import static com.nvidia.cuvs.internal.common.Util.CudaMemcpyKind.DEVICE_TO_HOST; +import static com.nvidia.cuvs.internal.common.Util.checkCuVSError; +import static com.nvidia.cuvs.internal.common.Util.cudaMemcpyAsync; +import static com.nvidia.cuvs.internal.common.Util.getStream; +import static com.nvidia.cuvs.internal.common.Util.prepareTensor; +import static com.nvidia.cuvs.internal.panama.headers_h.cuvsCagraSearchMultiSegment; +import static com.nvidia.cuvs.internal.panama.headers_h.cuvsStreamSync; +import static com.nvidia.cuvs.internal.panama.headers_h.kDLCUDA; +import static com.nvidia.cuvs.internal.panama.headers_h.kDLFloat; +import static com.nvidia.cuvs.internal.panama.headers_h.kDLUInt; + +import com.nvidia.cuvs.internal.BufferedCagraSearch; +import com.nvidia.cuvs.internal.CuVSMatrixInternal; +import com.nvidia.cuvs.internal.CuVSParamsHelper; +import com.nvidia.cuvs.internal.SelectKHelper; +import java.lang.foreign.Arena; +import java.lang.foreign.MemorySegment; +import java.lang.foreign.ValueLayout; +import java.util.List; + +/** + * Performs a single-query approximate nearest neighbor search across multiple CAGRA index segments + * using a shared GPU buffer, eliminating per-segment device-to-host copies. + * + *

Algorithm

+ *
    + *
+ * <ol>
+ *   <li>Allocate two global device buffers sized {@code numSegments × k}:
+ *       one for uint32 neighbor ordinals and one for float32 distances.</li>
+ *   <li>Call {@code cuvsCagraSearchMultiSegment} which launches a single GPU kernel covering all
+ *       segments concurrently (one CTA per segment), writing results into the global buffers.</li>
+ *   <li>Call {@code cuvsSelectK} on the main stream to find the global top-k entirely on GPU.</li>
+ *   <li>Sync the main stream.</li>
+ *   <li>Copy the three result arrays to host in a single pass.</li>
+ *   <li>Decode each result: {@code segment = position / k},
+ *       {@code ordinal = ordinals[position]}.</li>
+ * </ol>
+ * + * @since 25.10 + */ +public class MultiSegmentCagraSearch { + + private MultiSegmentCagraSearch() {} + + /** + * Searches multiple CAGRA index segments for the global top-k nearest neighbors. + * + * @param resources shared {@link CuVSResources} handle; all queries must use the same instance + * @param indices one {@link CagraIndex} per segment, in segment order; each must implement + * {@link BufferedCagraSearch} (all built-in implementations do) + * @param queries one {@link CagraQuery} per segment (same topK for all); each query encodes + * the target vector, search parameters, and optional prefilter for that segment + * @param k number of global nearest neighbors to return + * @return decoded search results with per-result (segmentIndex, ordinal, distance) + * @throws IllegalArgumentException if {@code indices} and {@code queries} differ in size, or if + * any index does not support buffered search + */ + public static MultiSegmentSearchResults search( + CuVSResources resources, List indices, List queries, int k) + throws Throwable { + int numSegments = indices.size(); + if (numSegments != queries.size()) { + throw new IllegalArgumentException( + "indices and queries must have the same size; got " + + numSegments + + " vs " + + queries.size()); + } + if (numSegments == 0) { + return new MultiSegmentSearchResults(0, new int[0], new int[0], new float[0]); + } + + // Validate that all indices support buffered search. 
+ BufferedCagraSearch[] buffered = new BufferedCagraSearch[numSegments]; + for (int i = 0; i < numSegments; i++) { + CagraIndex idx = indices.get(i); + if (!(idx instanceof BufferedCagraSearch)) { + throw new IllegalArgumentException( + "Index at position " + i + " does not support buffered search"); + } + buffered[i] = (BufferedCagraSearch) idx; + } + + long totalCandidates = (long) numSegments * k; + long neighborsBytes = totalCandidates * Integer.BYTES; // uint32 per ordinal + long distancesBytes = totalCandidates * Float.BYTES; // float32 per distance + long outIdxBytes = (long) k * Long.BYTES; // int64 positions from select_k + long outValBytes = (long) k * Float.BYTES; + + CagraSearchParams searchParameters = queries.get(0).getCagraSearchParameters(); + + try (var resourcesAccessor = resources.access()) { + long cuvsRes = resourcesAccessor.handle(); + var cuvsStream = getStream(cuvsRes); + + try (var globalNeighborsDP = allocateRMMSegment(cuvsRes, neighborsBytes); + var globalDistancesDP = allocateRMMSegment(cuvsRes, distancesBytes); + var outIdxDP = allocateRMMSegment(cuvsRes, outIdxBytes); + var outValDP = allocateRMMSegment(cuvsRes, outValBytes)) { + + // --- Phase 1: call cuvsCagraSearchMultiSegment --- + // Single kernel launch covers all segments; results land in globalNeighborsDP / + // globalDistancesDP on the same CUDA stream, so SelectK below sees them via ordering. 
+ try (var arena = Arena.ofConfined()) { + MemorySegment sp = CuVSParamsHelper.buildCagraSearchParams(arena, searchParameters); + + // Build C arrays: cuvsCagraIndex_t* indices, DLManagedTensor** q/n/d + MemorySegment indexArray = arena.allocate(ValueLayout.ADDRESS, numSegments); + MemorySegment queriesArray = arena.allocate(ValueLayout.ADDRESS, numSegments); + MemorySegment neighborsArray = arena.allocate(ValueLayout.ADDRESS, numSegments); + MemorySegment distancesArray = arena.allocate(ValueLayout.ADDRESS, numSegments); + + long[] segShape = {1, k}; + for (int i = 0; i < numSegments; i++) { + // Index handle + indexArray.setAtIndex(ValueLayout.ADDRESS, i, buffered[i].getIndexHandle()); + + // Query DLTensor + var queryVectors = (CuVSMatrixInternal) queries.get(i).getQueryVectors(); + queriesArray.setAtIndex(ValueLayout.ADDRESS, i, queryVectors.toTensor(arena)); + + // Neighbors DLTensor — slice of global buffer + long nByteOffset = (long) i * k * Integer.BYTES; + MemorySegment nSlice = + MemorySegment.ofAddress(globalNeighborsDP.handle().address() + nByteOffset); + neighborsArray.setAtIndex( + ValueLayout.ADDRESS, + i, + prepareTensor(arena, nSlice, segShape, kDLUInt(), 32, kDLCUDA())); + + // Distances DLTensor — slice of global buffer + long dByteOffset = (long) i * k * Float.BYTES; + MemorySegment dSlice = + MemorySegment.ofAddress(globalDistancesDP.handle().address() + dByteOffset); + distancesArray.setAtIndex( + ValueLayout.ADDRESS, + i, + prepareTensor(arena, dSlice, segShape, kDLFloat(), 32, kDLCUDA())); + } + + checkCuVSError( + cuvsCagraSearchMultiSegment( + cuvsRes, + sp, + numSegments, + indexArray, + queriesArray, + neighborsArray, + distancesArray), + "cuvsCagraSearchMultiSegment"); + } + + // --- Phase 2: select global top-k on GPU --- + SelectKHelper.selectK( + cuvsRes, + globalDistancesDP.handle(), + totalCandidates, + outValDP.handle(), + outIdxDP.handle(), + k); + + // No stream sync needed here: the D2H copies below are enqueued on the same 
cuvsStream, + // so CUDA stream ordering guarantees selectK completes before the copies begin. + + // --- Phase 3: single device-to-host copy for all three arrays --- + // Layout (in order of decreasing alignment): int64 outIdx | float32 outVal | uint32 + // ordinals + try (var hostArena = Arena.ofConfined()) { + MemorySegment hostBuf = + hostArena.allocate(outIdxBytes + outValBytes + neighborsBytes, Long.BYTES); + MemorySegment hostOutIdx = hostBuf.asSlice(0, outIdxBytes); + MemorySegment hostOutVal = hostBuf.asSlice(outIdxBytes, outValBytes); + MemorySegment hostAllOrdinals = + hostBuf.asSlice(outIdxBytes + outValBytes, neighborsBytes); + + cudaMemcpyAsync(hostOutIdx, outIdxDP.handle(), outIdxBytes, DEVICE_TO_HOST, cuvsStream); + cudaMemcpyAsync(hostOutVal, outValDP.handle(), outValBytes, DEVICE_TO_HOST, cuvsStream); + cudaMemcpyAsync( + hostAllOrdinals, + globalNeighborsDP.handle(), + neighborsBytes, + DEVICE_TO_HOST, + cuvsStream); + + checkCuVSError(cuvsStreamSync(cuvsRes), "cuvsStreamSync after D2H copy"); + + // --- Phase 4: decode results --- + int[] segmentIndices = new int[k]; + int[] selectedOrdinals = new int[k]; + float[] selectedDistances = new float[k]; + int count = 0; + + for (int j = 0; j < k; j++) { + long pos = hostOutIdx.getAtIndex(ValueLayout.JAVA_LONG, j); + float dist = hostOutVal.getAtIndex(ValueLayout.JAVA_FLOAT, j); + int ordinal = hostAllOrdinals.getAtIndex(ValueLayout.JAVA_INT, (int) pos); + + if (ordinal < 0) { + // CAGRA uses negative sentinel values for unfilled slots + continue; + } + segmentIndices[count] = (int) (pos / k); + selectedOrdinals[count] = ordinal; + selectedDistances[count] = dist; + count++; + } + + return new MultiSegmentSearchResults( + count, segmentIndices, selectedOrdinals, selectedDistances); + } + } + } + } +} diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/BufferedCagraSearch.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/BufferedCagraSearch.java new file mode 100644 index 
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
 * SPDX-License-Identifier: Apache-2.0
 */
package com.nvidia.cuvs.internal;

import com.nvidia.cuvs.CagraQuery;
import java.lang.foreign.Arena;
import java.lang.foreign.MemorySegment;

/**
 * Internal interface implemented by CAGRA index classes that support writing
 * search results directly into a caller-owned device buffer without syncing
 * the stream or copying results to host.
 *
 * <p>Used by {@link com.nvidia.cuvs.MultiSegmentCagraSearch} to queue all
 * per-segment searches before running a single GPU-side top-k reduction.
 */
public interface BufferedCagraSearch {

  /**
   * Runs CAGRA search and writes results into a slice of caller-owned device
   * buffers without copying results to host or syncing the stream.
   *
   * <p>Results are written at element offset {@code segmentIdx * query.getTopK()}
   * in each buffer:
   *
   * <ul>
   *   <li>{@code globalNeighborsDP}: uint32 ordinals</li>
   *   <li>{@code globalDistancesDP}: float32 distances</li>
   * </ul>
   *
   * <p>The search is submitted to {@code segmentStream}. The caller is responsible for
   * synchronizing that stream (e.g. via a CUDA event) before consuming the output buffers.
   *
   * @param query query with vectors, topK, search params, optional prefilter
   * @param globalNeighborsDP device pointer to the shared uint32 neighbors buffer
   * @param globalDistancesDP device pointer to the shared float32 distances buffer
   * @param segmentIdx zero-based segment index; determines the write offset
   * @param segmentCuvsRes {@code cuvsResources_t} handle whose CUDA stream receives the kernel
   * @param segmentStream CUDA stream corresponding to {@code segmentCuvsRes}; passed explicitly
   *     to avoid a redundant {@code cuvsStreamGet} call inside the method
   * @param searchParams pre-built {@code cuvsCagraSearchParams} struct; shared across all
   *     segments to avoid repeated allocation and population
   * @param arena shared scratch arena for per-call CPU-side allocations (tensor descriptors,
   *     filter struct); must remain open until after this call returns and the GPU kernel
   *     has launched
   */
  void searchIntoBuffer(
      CagraQuery query,
      MemorySegment globalNeighborsDP,
      MemorySegment globalDistancesDP,
      int segmentIdx,
      long segmentCuvsRes,
      MemorySegment segmentStream,
      MemorySegment searchParams,
      Arena arena)
      throws Throwable;

  /**
   * Returns the raw {@code cuvsCagraIndex_t} handle as a {@link MemorySegment}.
   * Used by {@link com.nvidia.cuvs.MultiSegmentCagraSearch} to build the index pointer array
   * for {@code cuvsCagraSearchMultiSegment}.
   */
  MemorySegment getIndexHandle();
}

Unlike {@link #search}, this method does not copy results to host memory and does + * not synchronize the CUDA stream. The caller is responsible for both after all + * per-segment searches have been queued. + * + *

Results for this segment are written at byte offset {@code segmentIdx * topK} elements + * from the start of each buffer: + *

    + *
  • {@code globalNeighborsDP}: uint32 ordinals, {@code topK} entries per segment
  • + *
  • {@code globalDistancesDP}: float32 distances, {@code topK} entries per segment
  • + *
+ * + * @param query query parameters including vectors, topK, search params, and optional + * prefilter; must have exactly one query vector (numQueries == 1) + * @param globalNeighborsDP device pointer to the start of the shared uint32 neighbors buffer + * @param globalDistancesDP device pointer to the start of the shared float32 distances buffer + * @param segmentIdx zero-based index of this segment; determines the write offset + * @param segmentCuvsRes {@code cuvsResources_t} handle whose CUDA stream receives the kernel + */ + @Override + public MemorySegment getIndexHandle() { + return cagraIndexReference.getMemorySegment(); + } + + @Override + public void searchIntoBuffer( + CagraQuery query, + MemorySegment globalNeighborsDP, + MemorySegment globalDistancesDP, + int segmentIdx, + long segmentCuvsRes, + MemorySegment segmentStream, + MemorySegment searchParams, + Arena arena) + throws Throwable { + checkNotDestroyed(); + int topK = query.getTopK(); + var queryVectors = (CuVSMatrixInternal) query.getQueryVectors(); + long numQueries = queryVectors.size(); + + final boolean hasPreFilter = query.getPrefilter() != null; + final BitSet[] prefilters = + hasPreFilter ? new BitSet[] {query.getPrefilter()} : EMPTY_PREFILTER_BITSET; + final long prefilterDataLength = hasPreFilter ? query.getNumDocs() * prefilters.length : 0; + final long prefilterLen = hasPreFilter ? (prefilterDataLength + 31) / 32 : 0; + final long prefilterBytes = C_INT_BYTE_SIZE * prefilterLen; + + // Pointers into the global buffer at this segment's slice. 
+ long neighborByteOffset = (long) segmentIdx * topK * C_INT_BYTE_SIZE; + long distanceByteOffset = (long) segmentIdx * topK * Float.BYTES; + MemorySegment neighborSlice = + MemorySegment.ofAddress(globalNeighborsDP.address() + neighborByteOffset); + MemorySegment distanceSlice = + MemorySegment.ofAddress(globalDistancesDP.address() + distanceByteOffset); + + if (!(queryVectors instanceof CuVSDeviceMatrix)) { + throw new IllegalArgumentException( + "searchIntoBuffer requires query vectors already on device"); + } + try (var prefilterDP = + hasPreFilter + ? allocateRMMSegment(segmentCuvsRes, prefilterBytes) + : CloseableRMMAllocation.EMPTY) { + var deviceQueryVectors = (CuVSMatrixInternal) queryVectors; + + var queryTensor = deviceQueryVectors.toTensor(arena); + long[] neighborsShape = {numQueries, topK}; + MemorySegment neighborsTensor = + prepareTensor(arena, neighborSlice, neighborsShape, kDLUInt(), 32, kDLCUDA()); + long[] distancesShape = {numQueries, topK}; + MemorySegment distancesTensor = + prepareTensor( + arena, + distanceSlice, + distancesShape, + deviceQueryVectors.code(), + deviceQueryVectors.bits(), + kDLCUDA()); + + MemorySegment prefilter = cuvsFilter.allocate(arena); + if (!hasPreFilter) { + cuvsFilter.type(prefilter, 0); // NO_FILTER + cuvsFilter.addr(prefilter, 0); + } else { + BitSet concatenatedFilters = concatenate(prefilters, query.getNumDocs()); + long[] filters = concatenatedFilters.toLongArray(); + var prefilterDataMemorySegment = buildMemorySegment(arena, filters); + long[] prefilterShape = {prefilterLen}; + Util.cudaMemcpyAsync( + prefilterDP.handle(), + prefilterDataMemorySegment, + prefilterBytes, + HOST_TO_DEVICE, + segmentStream); + MemorySegment prefilterTensor = + prepareTensor(arena, prefilterDP.handle(), prefilterShape, kDLUInt(), 32, kDLCUDA()); + cuvsFilter.type(prefilter, 1); + cuvsFilter.addr(prefilter, prefilterTensor.address()); + } + + var returnValue = + cuvsCagraSearch( + segmentCuvsRes, + searchParams, + 
cagraIndexReference.getMemorySegment(), + queryTensor, + neighborsTensor, + distancesTensor, + prefilter); + checkCuVSError(returnValue, "cuvsCagraSearch (searchIntoBuffer)"); + // Intentionally no cudaMemcpyAsync and no stream sync here. + // The caller syncs the stream after queuing all segment searches. + } + } + @Override public void serialize(OutputStream outputStream) throws Throwable { Path path = @@ -632,8 +749,10 @@ private static void populateNativeIndexParams( cuvsAceParams.npartitions(cuvsAceParamsMemorySegment, cuVSAceParams.getNpartitions()); cuvsAceParams.ef_construction(cuvsAceParamsMemorySegment, cuVSAceParams.getEfConstruction()); cuvsAceParams.use_disk(cuvsAceParamsMemorySegment, cuVSAceParams.isUseDisk()); - cuvsAceParams.max_host_memory_gb(cuvsAceParamsMemorySegment, cuVSAceParams.getMaxHostMemoryGb()); - cuvsAceParams.max_gpu_memory_gb(cuvsAceParamsMemorySegment, cuVSAceParams.getMaxGpuMemoryGb()); + cuvsAceParams.max_host_memory_gb( + cuvsAceParamsMemorySegment, cuVSAceParams.getMaxHostMemoryGb()); + cuvsAceParams.max_gpu_memory_gb( + cuvsAceParamsMemorySegment, cuVSAceParams.getMaxGpuMemoryGb()); String buildDir = cuVSAceParams.getBuildDir(); if (buildDir != null && !buildDir.isEmpty()) { diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSMatrixBaseImpl.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSMatrixBaseImpl.java index 98f4095ffc..03e08ac4c4 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSMatrixBaseImpl.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSMatrixBaseImpl.java @@ -7,6 +7,7 @@ import static com.nvidia.cuvs.internal.common.LinkerHelper.C_CHAR; import static com.nvidia.cuvs.internal.common.LinkerHelper.C_FLOAT; import static com.nvidia.cuvs.internal.common.LinkerHelper.C_INT; +import static com.nvidia.cuvs.internal.common.LinkerHelper.C_SHORT; import static com.nvidia.cuvs.internal.common.Util.checkCuVSError; import static 
com.nvidia.cuvs.internal.panama.headers_h.*; @@ -95,6 +96,7 @@ public ValueLayout valueLayout() { protected static ValueLayout valueLayoutFromType(DataType dataType) { return switch (dataType) { case FLOAT -> C_FLOAT; + case HALF -> C_SHORT; case INT, UINT -> C_INT; case BYTE -> C_CHAR; }; @@ -177,6 +179,8 @@ private static DataType dataTypeFromTensor(byte code, byte bits) { dataType = DataType.INT; } else if (code == kDLFloat() && bits == 32) { dataType = DataType.FLOAT; + } else if (code == kDLFloat() && bits == 16) { + dataType = DataType.HALF; } else if ((code == kDLInt() || code == kDLUInt()) && bits == 8) { dataType = DataType.BYTE; } else { diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSMatrixInternal.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSMatrixInternal.java index 35715b8336..6b4b617493 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSMatrixInternal.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSMatrixInternal.java @@ -38,7 +38,7 @@ default int code() { static int code(DataType dataType) { return switch (dataType) { - case FLOAT -> kDLFloat(); + case FLOAT, HALF -> kDLFloat(); case INT -> kDLInt(); case UINT, BYTE -> kDLUInt(); }; diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java index 950504bc5a..a4c914dbf8 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/CuVSParamsHelper.java @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs.internal; @@ -7,7 +7,9 @@ import static com.nvidia.cuvs.internal.common.Util.checkCuVSError; import static com.nvidia.cuvs.internal.panama.headers_h.*; +import com.nvidia.cuvs.CagraSearchParams; import com.nvidia.cuvs.internal.common.CloseableHandle; +import com.nvidia.cuvs.internal.panama.*; import java.lang.foreign.Arena; import java.lang.foreign.MemorySegment; @@ -25,6 +27,30 @@ public final class CuVSParamsHelper { private CuVSParamsHelper() {} + /** + * Allocates and populates a {@code cuvsCagraSearchParams} struct into {@code arena}. + * The returned segment is valid for the lifetime of {@code arena}. + */ + public static MemorySegment buildCagraSearchParams(Arena arena, CagraSearchParams params) { + MemorySegment seg = cuvsCagraSearchParams.allocate(arena); + cuvsCagraSearchParams.max_queries(seg, params.getMaxQueries()); + cuvsCagraSearchParams.itopk_size(seg, params.getITopKSize()); + cuvsCagraSearchParams.max_iterations(seg, params.getMaxIterations()); + cuvsCagraSearchParams.algo(seg, params.getCagraSearchAlgo().value); + cuvsCagraSearchParams.team_size(seg, params.getTeamSize()); + cuvsCagraSearchParams.search_width(seg, params.getSearchWidth()); + cuvsCagraSearchParams.min_iterations(seg, params.getMinIterations()); + cuvsCagraSearchParams.thread_block_size(seg, params.getThreadBlockSize()); + cuvsCagraSearchParams.hashmap_mode(seg, params.getHashMapMode().value); + cuvsCagraSearchParams.hashmap_max_fill_rate(seg, params.getHashMapMaxFillRate()); + cuvsCagraSearchParams.num_random_samplings(seg, params.getNumRandomSamplings()); + cuvsCagraSearchParams.rand_xor_mask(seg, params.getRandXORMask()); + cuvsCagraSearchParams.persistent(seg, params.isPersistent()); + cuvsCagraSearchParams.persistent_lifetime(seg, params.getPersistentLifetime()); + cuvsCagraSearchParams.persistent_device_usage(seg, params.getPersistentDeviceUsage()); + return seg; + } + public static CloseableHandle 
  /**
   * Configures this resources handle's temporary workspace allocator as a growing pool.
   *
   * <p>Delegates to the native {@code cuvsResourcesSetWorkspacePool}; per the C header, the
   * pool is uncapped and does not shrink until the resources object is destroyed.
   *
   * @param sizeBytes initial pool reservation in bytes
   */
  @Override
  public void setWorkspacePool(long sizeBytes) {
    checkCuVSError(
        cuvsResourcesSetWorkspacePool(resourceHandle, sizeBytes), "cuvsResourcesSetWorkspacePool");
  }
* SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs.internal; @@ -251,8 +251,8 @@ public static HnswIndex.Builder newBuilder(CuVSResources cuvsResources) { * @return A new HNSW index ready for search * @throws Throwable if an error occurs during building */ - public static HnswIndex build(CuVSResources resources, HnswIndexParams hnswParams, CuVSMatrix dataset) - throws Throwable { + public static HnswIndex build( + CuVSResources resources, HnswIndexParams hnswParams, CuVSMatrix dataset) throws Throwable { Objects.requireNonNull(resources); Objects.requireNonNull(hnswParams); Objects.requireNonNull(dataset); @@ -288,7 +288,8 @@ public static HnswIndex build(CuVSResources resources, HnswIndexParams hnswParam return new HnswIndexImpl(new IndexReference(hnswIndex), resources, hnswParams); } - private static CloseableHandle createHnswIndexParamsForBuild(Arena arena, HnswIndexParams params) { + private static CloseableHandle createHnswIndexParamsForBuild( + Arena arena, HnswIndexParams params) { var hnswParams = createHnswIndexParams(); MemorySegment seg = hnswParams.handle(); @@ -324,7 +325,7 @@ private static MemorySegment prepareTensorFromMatrix(Arena arena, CuVSMatrix dat return prepareTensor( arena, matrixInternal.memorySegment(), - new long[]{dataset.size(), dataset.columns()}, + new long[] {dataset.size(), dataset.columns()}, matrixInternal.code(), matrixInternal.bits(), kDLCPU()); diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/SelectKHelper.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/SelectKHelper.java new file mode 100644 index 0000000000..f859ea26c7 --- /dev/null +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/SelectKHelper.java @@ -0,0 +1,94 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +package com.nvidia.cuvs.internal; + +import static com.nvidia.cuvs.internal.common.LinkerHelper.C_INT; +import static com.nvidia.cuvs.internal.common.LinkerHelper.C_LONG; +import static com.nvidia.cuvs.internal.common.LinkerHelper.C_POINTER; +import static com.nvidia.cuvs.internal.common.Util.checkCuVSError; +import static com.nvidia.cuvs.internal.common.Util.prepareTensor; +import static com.nvidia.cuvs.internal.panama.headers_h.kDLCUDA; +import static com.nvidia.cuvs.internal.panama.headers_h.kDLFloat; +import static com.nvidia.cuvs.internal.panama.headers_h.kDLInt; + +import java.lang.foreign.Arena; +import java.lang.foreign.FunctionDescriptor; +import java.lang.foreign.Linker; +import java.lang.foreign.MemorySegment; +import java.lang.foreign.SymbolLookup; +import java.lang.invoke.MethodHandle; + +/** + * Panama FFI binding for {@code cuvsSelectK}. + * + *

Selects the k smallest float values from a flat device array of n candidates, writing output + * distances and their flat-array positions (int64) into caller-supplied device buffers. + */ +public class SelectKHelper { + + private static final MethodHandle cuvsSelectK$mh; + + static { + var linker = Linker.nativeLinker(); + SymbolLookup lookup = + SymbolLookup.libraryLookup(System.mapLibraryName("cuvs_c"), Arena.ofAuto()) + .or(SymbolLookup.loaderLookup()) + .or(linker.defaultLookup()); + + cuvsSelectK$mh = + linker.downcallHandle( + lookup + .find("cuvsSelectK") + .orElseThrow(() -> new UnsatisfiedLinkError("cuvsSelectK not found in libcuvs_c")), + FunctionDescriptor.of( + C_INT, // return: cuvsError_t + C_LONG, // cuvsResources_t res + C_POINTER, // DLManagedTensor* in_val + C_POINTER, // DLManagedTensor* out_val + C_POINTER // DLManagedTensor* out_idx + )); + } + + private SelectKHelper() {} + + /** + * Selects the {@code k} smallest distances from a flat device array of {@code n} candidates. + * + *

Output positions in {@code outIdxDP} are int64 column indices into [0, n). The caller + * recovers per-segment identity as {@code segment = position / segmentK}. + * + * @param cuvsRes cuvsResources_t handle (raw long) + * @param inValDP device pointer to float[n] input distances + * @param n number of input candidates + * @param outValDP device pointer to float[k] output distances + * @param outIdxDP device pointer to int64[k] output positions + * @param k number of results to select + */ + public static void selectK( + long cuvsRes, + MemorySegment inValDP, + long n, + MemorySegment outValDP, + MemorySegment outIdxDP, + long k) { + try (var arena = Arena.ofConfined()) { + long[] inShape = {1, n}; + long[] outShape = {1, k}; + + MemorySegment inValTensor = prepareTensor(arena, inValDP, inShape, kDLFloat(), 32, kDLCUDA()); + MemorySegment outValTensor = + prepareTensor(arena, outValDP, outShape, kDLFloat(), 32, kDLCUDA()); + MemorySegment outIdxTensor = + prepareTensor(arena, outIdxDP, outShape, kDLInt(), 64, kDLCUDA()); + + int rc = (int) cuvsSelectK$mh.invokeExact(cuvsRes, inValTensor, outValTensor, outIdxTensor); + checkCuVSError(rc, "cuvsSelectK"); + } catch (RuntimeException | Error e) { + throw e; + } catch (Throwable t) { + throw new RuntimeException("cuvsSelectK failed", t); + } + } +} diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/common/LinkerHelper.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/common/LinkerHelper.java index 6de70ce920..79b440b4a8 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/common/LinkerHelper.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/internal/common/LinkerHelper.java @@ -25,6 +25,9 @@ public class LinkerHelper { public static final ValueLayout.OfLong C_LONG = (ValueLayout.OfLong) LINKER.canonicalLayouts().get("long"); + public static final ValueLayout.OfShort C_SHORT = + (ValueLayout.OfShort) LINKER.canonicalLayouts().get("short"); + public static final 
ValueLayout.OfFloat C_FLOAT = (ValueLayout.OfFloat) LINKER.canonicalLayouts().get("float"); diff --git a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java index 1d3199f26f..72f0b4e232 100644 --- a/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java +++ b/java/cuvs-java/src/main/java22/com/nvidia/cuvs/spi/JDKProvider.java @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ package com.nvidia.cuvs.spi; @@ -138,6 +138,9 @@ public String toString() { private final cuvsRMMMemoryResourceReset cuvsRMMMemoryResourceResetInvoker = cuvsRMMMemoryResourceReset.makeInvoker(); + private final cuvsRMMAsyncMemoryResourceEnable cuvsRMMAsyncMemoryResourceEnableInvoker = + cuvsRMMAsyncMemoryResourceEnable.makeInvoker(); + private final cuvsGetLogLevel GET_LOG_LEVEL_INVOKER = cuvsGetLogLevel.makeInvoker(); private JDKProvider() {} @@ -255,8 +258,8 @@ public HnswIndex hnswIndexFromCagra(HnswIndexParams hnswParams, CagraIndex cagra } @Override - public HnswIndex hnswIndexBuild(CuVSResources resources, HnswIndexParams hnswParams, CuVSMatrix dataset) - throws Throwable { + public HnswIndex hnswIndexBuild( + CuVSResources resources, HnswIndexParams hnswParams, CuVSMatrix dataset) throws Throwable { return HnswIndexImpl.build(resources, hnswParams, dataset); } @@ -436,6 +439,12 @@ public Level getLogLevel() { throw new IllegalArgumentException("Unexpected log level [" + logLevel + "]"); } + @Override + public void enableRMMAsyncMemory() { + checkCuVSError( + cuvsRMMAsyncMemoryResourceEnableInvoker.apply(), "cuvsRMMAsyncMemoryResourceEnable"); + } + @Override public void enableRMMPooledMemory(int initialPoolSizePercent, int maxPoolSizePercent) { checkCuVSError( @@ -603,6 +612,15 @@ public void addVector(int[] vector) { 
  /**
   * Appends one vector of 16-bit elements to this builder.
   *
   * <p>Each {@code short} carries the raw bits of one element — presumably IEEE 754
   * half-precision, matching the HALF data type handled elsewhere in this package;
   * confirm against {@code internalAddVector} implementations.
   *
   * @param vector one value per column; length must equal the builder's column count
   * @throws IllegalArgumentException if {@code vector.length != columns}
   */
  public void addVector(short[] vector) {
    if (vector.length != columns) {
      throw new IllegalArgumentException(
          String.format(
              Locale.ROOT, "Expected a vector of size [%d], got [%d]", columns, vector.length));
    }
    // Wrap the heap array as a MemorySegment; the abstract sink decides where it is copied.
    internalAddVector(MemorySegment.ofArray(vector));
  }