diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index b33daa635a..f501f5a39d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -487,6 +487,86 @@ if(NOT BUILD_CPU_ONLY) OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_flat/post_process" KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements ) + set(ivf_sq_ns "cuvs::neighbors::ivf_sq::detail") + generate_jit_lto_kernels( + jit_lto_files + NAME_FORMAT "ivf_sq_scan_capacity_@capacity@" + MATRIX_JSON_FILE + "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_matrix.json" + KERNEL_INPUT_FILE + "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_kernel.cu.in" + FRAGMENT_TAG_FORMAT "${ivf_sq_ns}::fragment_tag_ivf_sq_scan<@capacity@>" + FRAGMENT_TAG_HEADER_FILES "" + "" + OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_sq/scan" + KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements + ) + generate_jit_lto_kernels( + jit_lto_files + NAME_FORMAT "ivf_sq_setup_invariant_smem_metric_@metric_name@" + MATRIX_JSON_FILE + "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_invariant_smem_matrix.json" + KERNEL_INPUT_FILE + "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_invariant_smem_kernel.cu.in" + FRAGMENT_TAG_FORMAT + "${ivf_sq_ns}::fragment_tag_setup_invariant_smem<${ivf_sq_ns}::tag_metric_@metric_name@>" + FRAGMENT_TAG_HEADER_FILES "" + OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_sq/setup_invariant_smem" + KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements + ) + generate_jit_lto_kernels( + jit_lto_files + NAME_FORMAT "ivf_sq_setup_per_probe_smem_metric_@metric_name@" + MATRIX_JSON_FILE + "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_per_probe_smem_matrix.json" + KERNEL_INPUT_FILE + "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_per_probe_smem_kernel.cu.in" + 
FRAGMENT_TAG_FORMAT + "${ivf_sq_ns}::fragment_tag_setup_per_probe_smem<${ivf_sq_ns}::tag_metric_@metric_name@>" + FRAGMENT_TAG_HEADER_FILES "" + OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_sq/setup_per_probe_smem" + KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements + ) + generate_jit_lto_kernels( + jit_lto_files + NAME_FORMAT "ivf_sq_accumulate_distance_metric_@metric_name@" + MATRIX_JSON_FILE + "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_sq/detail/jit_lto_kernels/accumulate_distance_matrix.json" + KERNEL_INPUT_FILE + "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_sq/detail/jit_lto_kernels/accumulate_distance_kernel.cu.in" + FRAGMENT_TAG_FORMAT + "${ivf_sq_ns}::fragment_tag_accumulate_distance<${ivf_sq_ns}::tag_metric_@metric_name@>" + FRAGMENT_TAG_HEADER_FILES "" + OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_sq/accumulate_distance" + KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements + ) + generate_jit_lto_kernels( + jit_lto_files + NAME_FORMAT "ivf_sq_finalize_distance_metric_@metric_name@" + MATRIX_JSON_FILE + "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_sq/detail/jit_lto_kernels/finalize_distance_matrix.json" + KERNEL_INPUT_FILE + "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_sq/detail/jit_lto_kernels/finalize_distance_kernel.cu.in" + FRAGMENT_TAG_FORMAT + "${ivf_sq_ns}::fragment_tag_finalize_distance<${ivf_sq_ns}::tag_metric_@metric_name@>" + FRAGMENT_TAG_HEADER_FILES "" + OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_sq/finalize_distance" + KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements + ) + generate_jit_lto_kernels( + jit_lto_files + NAME_FORMAT "ivf_sq_filter_@filter_name@" + MATRIX_JSON_FILE + "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_sq/detail/jit_lto_kernels/filter_matrix.json" + KERNEL_INPUT_FILE + "${CMAKE_CURRENT_SOURCE_DIR}/src/neighbors/ivf_sq/detail/jit_lto_kernels/filter_kernel.cu.in" + FRAGMENT_TAG_FORMAT + 
"${ivf_sq_ns}::fragment_tag_ivf_sq_filter<${neighbors_ns}::tag_filter_@filter_name@>" + FRAGMENT_TAG_HEADER_FILES "" + "" + OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated_kernels/ivf_sq/filter" + KERNEL_LINK_LIBRARIES jit_lto_kernel_usage_requirements + ) set(ivf_pq_ns "cuvs::neighbors::ivf_pq::detail") generate_jit_lto_kernels( jit_lto_files @@ -897,6 +977,11 @@ if(NOT BUILD_CPU_ONLY) src/neighbors/ivf_pq/detail/ivf_pq_process_and_fill_codes.cu ${ivf_pq_search_inst_files} ${ivf_pq_transform_inst_files} + src/neighbors/ivf_sq_index.cpp + src/neighbors/ivf_sq/ivf_sq_build_extend_float_uint8_t_int64_t.cu + src/neighbors/ivf_sq/ivf_sq_build_extend_half_uint8_t_int64_t.cu + src/neighbors/ivf_sq/ivf_sq_search_uint8_t_int64_t.cu + src/neighbors/ivf_sq/ivf_sq_serialize_uint8_t.cu src/neighbors/knn_merge_parts.cu src/neighbors/nn_descent.cu ${nn_descent_inst_files} diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt index 4e4527267c..0755a15a2f 100644 --- a/cpp/bench/ann/CMakeLists.txt +++ b/cpp/bench/ann/CMakeLists.txt @@ -13,6 +13,7 @@ list(APPEND CMAKE_MODULE_PATH "${CUVS_SOURCE_DIR}") option(CUVS_ANN_BENCH_USE_FAISS_GPU_FLAT "Include faiss' brute-force knn algorithm in benchmark" ON) option(CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT "Include faiss' ivf flat algorithm in benchmark" ON) option(CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_PQ "Include faiss' ivf pq algorithm in benchmark" ON) +option(CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_SQ "Include faiss' ivf sq algorithm in benchmark" ON) option(CUVS_ANN_BENCH_USE_FAISS_GPU_CAGRA "Include faiss' cagra algorithm in benchmark" ON) option(CUVS_ANN_BENCH_USE_FAISS_GPU_CAGRA_HNSW "Include faiss' cagra algorithm for build and hnsw for search in benchmark" ON @@ -22,8 +23,10 @@ option(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT "Include faiss' cpu ivf flat algori ON ) option(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_PQ "Include faiss' cpu ivf pq algorithm in benchmark" ON) +option(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_SQ "Include faiss' 
cpu ivf sq algorithm in benchmark" ON) option(CUVS_ANN_BENCH_USE_FAISS_CPU_HNSW_FLAT "Include faiss' hnsw algorithm in benchmark" ON) option(CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT "Include cuVS ivf flat algorithm in benchmark" ON) +option(CUVS_ANN_BENCH_USE_CUVS_IVF_SQ "Include cuVS ivf sq algorithm in benchmark" ON) option(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ "Include cuVS ivf pq algorithm in benchmark" ON) option(CUVS_ANN_BENCH_USE_CUVS_CAGRA "Include cuVS CAGRA in benchmark" ON) option(CUVS_ANN_BENCH_USE_CUVS_BRUTE_FORCE "Include cuVS brute force knn in benchmark" ON) @@ -80,6 +83,7 @@ set(CUVS_USE_FAISS_STATIC ON) if(BUILD_CPU_ONLY) set(CUVS_FAISS_ENABLE_GPU OFF) set(CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT OFF) + set(CUVS_ANN_BENCH_USE_CUVS_IVF_SQ OFF) set(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ OFF) set(CUVS_ANN_BENCH_USE_CUVS_CAGRA OFF) set(CUVS_ANN_BENCH_USE_CUVS_BRUTE_FORCE OFF) @@ -97,6 +101,7 @@ set(CUVS_ANN_BENCH_USE_CUVS OFF) if(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ OR CUVS_ANN_BENCH_USE_CUVS_BRUTE_FORCE OR CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT + OR CUVS_ANN_BENCH_USE_CUVS_IVF_SQ OR CUVS_ANN_BENCH_USE_CUVS_CAGRA OR CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB OR CUVS_KNN_BENCH_USE_CUVS_BRUTE_FORCE @@ -244,6 +249,12 @@ if(CUVS_ANN_BENCH_USE_CUVS_IVF_FLAT) ) endif() +if(CUVS_ANN_BENCH_USE_CUVS_IVF_SQ) + ConfigureAnnBench( + NAME CUVS_IVF_SQ PATH src/cuvs/cuvs_benchmark.cu src/cuvs/cuvs_ivf_sq.cu LINKS cuvs + ) +endif() + if(CUVS_ANN_BENCH_USE_CUVS_BRUTE_FORCE) ConfigureAnnBench(NAME CUVS_BRUTE_FORCE PATH src/cuvs/cuvs_benchmark.cu LINKS cuvs) endif() @@ -309,6 +320,12 @@ if(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_PQ) ) endif() +if(CUVS_ANN_BENCH_USE_FAISS_CPU_IVF_SQ) + ConfigureAnnBench( + NAME FAISS_CPU_IVF_SQ PATH src/faiss/faiss_cpu_benchmark.cpp LINKS ${CUVS_FAISS_TARGETS} + ) +endif() + if(CUVS_ANN_BENCH_USE_FAISS_CPU_HNSW_FLAT) ConfigureAnnBench( NAME FAISS_CPU_HNSW_FLAT PATH src/faiss/faiss_cpu_benchmark.cpp LINKS ${CUVS_FAISS_TARGETS} @@ -329,6 +346,13 @@ if(CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_PQ AND 
CUVS_FAISS_ENABLE_GPU) ) endif() +if(CUVS_ANN_BENCH_USE_FAISS_GPU_IVF_SQ AND CUVS_FAISS_ENABLE_GPU) + ConfigureAnnBench( + NAME FAISS_GPU_IVF_SQ PATH src/faiss/faiss_gpu_benchmark.cu LINKS ${CUVS_FAISS_TARGETS} + raft::raft + ) +endif() + if(CUVS_ANN_BENCH_USE_FAISS_GPU_FLAT AND CUVS_FAISS_ENABLE_GPU) ConfigureAnnBench( NAME FAISS_GPU_FLAT PATH src/faiss/faiss_gpu_benchmark.cu LINKS ${CUVS_FAISS_TARGETS} diff --git a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h index 2eaf3123a0..6e7a6e2179 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h +++ b/cpp/bench/ann/src/cuvs/cuvs_ann_bench_param_parser.h @@ -35,6 +35,11 @@ extern template class cuvs::bench::cuvs_cagra; extern template class cuvs::bench::cuvs_cagra; #endif +#ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_SQ +#include "cuvs_ivf_sq_wrapper.h" +extern template class cuvs::bench::cuvs_ivf_sq; +extern template class cuvs::bench::cuvs_ivf_sq; +#endif #ifdef CUVS_ANN_BENCH_USE_CUVS_MG #include "cuvs_ivf_flat_wrapper.h" #include "cuvs_mg_ivf_flat_wrapper.h" @@ -86,6 +91,26 @@ void parse_search_param(const nlohmann::json& conf, } #endif +#ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_SQ +template +void parse_build_param(const nlohmann::json& conf, + typename cuvs::bench::cuvs_ivf_sq::build_param& param) +{ + param.n_lists = conf.at("nlist"); + if (conf.contains("niter")) { param.kmeans_n_iters = conf.at("niter"); } + if (conf.contains("ratio")) { + param.kmeans_trainset_fraction = 1.0 / static_cast(conf.at("ratio")); + } +} + +template +void parse_search_param(const nlohmann::json& conf, + typename cuvs::bench::cuvs_ivf_sq::search_param& param) +{ + param.ivf_sq_params.n_probes = conf.at("nprobe"); +} +#endif + #if defined(CUVS_ANN_BENCH_USE_CUVS_IVF_PQ) || defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA) || \ defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA_HNSWLIB) || defined(CUVS_ANN_BENCH_USE_CUVS_MG) || \ defined(CUVS_ANN_BENCH_USE_CUVS_CAGRA_DISKANN) diff --git 
a/cpp/bench/ann/src/cuvs/cuvs_benchmark.cu b/cpp/bench/ann/src/cuvs/cuvs_benchmark.cu index aebac654c2..22aeb31c38 100644 --- a/cpp/bench/ann/src/cuvs/cuvs_benchmark.cu +++ b/cpp/bench/ann/src/cuvs/cuvs_benchmark.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -84,6 +84,15 @@ auto create_algo(const std::string& algo_name, } } #endif +#ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_SQ + if constexpr (std::is_same_v || std::is_same_v) { + if (algo_name == "cuvs_ivf_sq") { + typename cuvs::bench::cuvs_ivf_sq::build_param param; + parse_build_param(conf, param); + a = std::make_unique>(metric, dim, param); + } + } +#endif #ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_PQ if (algo_name == "raft_ivf_pq" || algo_name == "cuvs_ivf_pq") { typename cuvs::bench::cuvs_ivf_pq::build_param param; @@ -151,6 +160,15 @@ auto create_search_param(const std::string& algo_name, const nlohmann::json& con } } #endif +#ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_SQ + if constexpr (std::is_same_v || std::is_same_v) { + if (algo_name == "cuvs_ivf_sq") { + auto param = std::make_unique::search_param>(); + parse_search_param(conf, *param); + return param; + } + } +#endif #ifdef CUVS_ANN_BENCH_USE_CUVS_IVF_PQ if (algo_name == "raft_ivf_pq" || algo_name == "cuvs_ivf_pq") { auto param = std::make_unique::search_param>(); diff --git a/cpp/bench/ann/src/cuvs/cuvs_ivf_sq.cu b/cpp/bench/ann/src/cuvs/cuvs_ivf_sq.cu new file mode 100644 index 0000000000..ec41324c8d --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_ivf_sq.cu @@ -0,0 +1,10 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +#include "cuvs_ivf_sq_wrapper.h" + +namespace cuvs::bench { +template class cuvs_ivf_sq; +template class cuvs_ivf_sq; +} // namespace cuvs::bench diff --git a/cpp/bench/ann/src/cuvs/cuvs_ivf_sq_wrapper.h b/cpp/bench/ann/src/cuvs/cuvs_ivf_sq_wrapper.h new file mode 100644 index 0000000000..1503e6bb84 --- /dev/null +++ b/cpp/bench/ann/src/cuvs/cuvs_ivf_sq_wrapper.h @@ -0,0 +1,141 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include "../common/ann_types.hpp" +#include "cuvs_ann_bench_utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace cuvs::bench { + +template +class cuvs_ivf_sq : public algo, public algo_gpu { + public: + using search_param_base = typename algo::search_param; + + struct search_param : public search_param_base { + cuvs::neighbors::ivf_sq::search_params ivf_sq_params; + }; + + using build_param = cuvs::neighbors::ivf_sq::index_params; + + cuvs_ivf_sq(Metric metric, int dim, const build_param& param) + : algo(metric, dim), index_params_(param), dimension_(dim) + { + index_params_.metric = parse_metric_type(metric); + index_params_.conservative_memory_allocation = true; + RAFT_CUDA_TRY(cudaGetDevice(&device_)); + } + + void build(const T* dataset, size_t nrow) final; + + void set_search_param(const search_param_base& param, const void* filter_bitset) override; + + void search(const T* queries, + int batch_size, + int k, + algo_base::index_type* neighbors, + float* distances) const override; + + [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override + { + return handle_.get_sync_stream(); + } + + [[nodiscard]] auto get_preference() const -> algo_property override + { + algo_property property; + property.dataset_memory_type = MemoryType::kHostMmap; + property.query_memory_type = MemoryType::kDevice; + 
return property; + } + + void save(const std::string& file) const override; + void load(const std::string&) override; + std::unique_ptr> copy() override; + + private: + configured_raft_resources handle_{}; + build_param index_params_; + cuvs::neighbors::ivf_sq::search_params search_params_; + std::shared_ptr> index_; + int device_; + int dimension_; + + std::shared_ptr filter_; +}; + +template +void cuvs_ivf_sq::build(const T* dataset, size_t nrow) +{ + size_t n_streams = 1; + raft::resource::set_cuda_stream_pool(handle_, std::make_shared(n_streams)); + index_ = std::make_shared>( + std::move(cuvs::neighbors::ivf_sq::build( + handle_, + index_params_, + raft::make_host_matrix_view(dataset, nrow, dimension_)))); +} + +template +void cuvs_ivf_sq::set_search_param(const search_param_base& param, const void* filter_bitset) +{ + filter_ = make_cuvs_filter(filter_bitset, index_->size()); + auto sp = dynamic_cast(param); + search_params_ = sp.ivf_sq_params; + assert(search_params_.n_probes <= index_params_.n_lists); +} + +template +void cuvs_ivf_sq::save(const std::string& file) const +{ + cuvs::neighbors::ivf_sq::serialize(handle_, file, *index_); +} + +template +void cuvs_ivf_sq::load(const std::string& file) +{ + index_ = + std::make_shared>(handle_, index_params_, this->dim_); + cuvs::neighbors::ivf_sq::deserialize(handle_, file, index_.get()); +} + +template +std::unique_ptr> cuvs_ivf_sq::copy() +{ + return std::make_unique>(*this); +} + +template +void cuvs_ivf_sq::search( + const T* queries, int batch_size, int k, algo_base::index_type* neighbors, float* distances) const +{ + static_assert(sizeof(algo_base::index_type) == sizeof(int64_t)); + + cuvs::neighbors::ivf_sq::search( + handle_, + search_params_, + *index_, + raft::make_device_matrix_view(queries, batch_size, index_->dim()), + raft::make_device_matrix_view( + reinterpret_cast(neighbors), batch_size, k), + raft::make_device_matrix_view(distances, batch_size, k), + *filter_); +} + +} // namespace cuvs::bench 
diff --git a/cpp/include/cuvs/detail/jit_lto/ivf_sq/scan_fragments.hpp b/cpp/include/cuvs/detail/jit_lto/ivf_sq/scan_fragments.hpp new file mode 100644 index 0000000000..d16d47d653 --- /dev/null +++ b/cpp/include/cuvs/detail/jit_lto/ivf_sq/scan_fragments.hpp @@ -0,0 +1,37 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +namespace cuvs::neighbors::ivf_sq::detail { + +struct tag_metric_l2 {}; +struct tag_metric_ip {}; +struct tag_metric_cosine {}; + +// Scan entrypoint fragment. Templated only on Capacity. The warpsort runs +// ascending for all metrics; metric specialization lives in the four +// device-function fragments below (finalize_distance is responsible for +// returning a min-close value). +template +struct fragment_tag_ivf_sq_scan {}; + +template +struct fragment_tag_ivf_sq_filter {}; + +// Metric-specific device-function fragments composed in at JIT-link time. +template +struct fragment_tag_setup_invariant_smem {}; + +template +struct fragment_tag_setup_per_probe_smem {}; + +template +struct fragment_tag_accumulate_distance {}; + +template +struct fragment_tag_finalize_distance {}; + +} // namespace cuvs::neighbors::ivf_sq::detail diff --git a/cpp/include/cuvs/neighbors/ivf_sq.hpp b/cpp/include/cuvs/neighbors/ivf_sq.hpp new file mode 100644 index 0000000000..b342d0ce50 --- /dev/null +++ b/cpp/include/cuvs/neighbors/ivf_sq.hpp @@ -0,0 +1,860 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include "common.hpp" +#include +#include +#include +#include +#include +#include +#include + +namespace cuvs::neighbors::ivf_sq { + +/** + * @defgroup ivf_sq_cpp_index_params IVF-SQ index build parameters + * @{ + */ + +constexpr static uint32_t kIndexGroupSize = 32; + +struct index_params : cuvs::neighbors::index_params { + /** The number of inverted lists (clusters) */ + uint32_t n_lists = 1024; + /** The number of iterations searching for kmeans centers (index building). */ + uint32_t kmeans_n_iters = 20; + /** The fraction of data to use during iterative kmeans building. */ + double kmeans_trainset_fraction = 0.5; + /** + * By default, the algorithm allocates more space than necessary for individual clusters + * (`list_data`). This allows to amortize the cost of memory allocation and reduce the number of + * data copies during repeated calls to `extend` (extending the database). + * + * The alternative is the conservative allocation behavior; when enabled, the algorithm always + * allocates the minimum amount of memory required to store the given number of records. Set this + * flag to `true` if you prefer to use as little GPU memory for the database as possible. + */ + bool conservative_memory_allocation = false; + /** + * Whether to add the dataset content to the index, i.e.: + * + * - `true` means the index is filled with the dataset vectors and ready to search after calling + * `build`. + * - `false` means `build` only trains the underlying model (e.g. quantizer or clustering), but + * the index is left empty; you'd need to call `extend` on the index afterwards to populate it. + */ + bool add_data_on_build = true; +}; + +static_assert(std::is_aggregate_v); + +/** + * @} + */ + +/** + * @defgroup ivf_sq_cpp_search_params IVF-SQ index search parameters + * @{ + */ + +struct search_params : cuvs::neighbors::search_params { + /** The number of clusters to search. 
*/ + uint32_t n_probes = 20; +}; + +static_assert(std::is_aggregate_v); + +/** + * @} + */ + +/** + * @defgroup ivf_sq_cpp_list_spec IVF-SQ list storage spec + * @{ + */ + +template +struct list_spec { + static_assert(std::is_same_v, "IVF-SQ code type CodeT must be uint8_t"); + + using value_type = CodeT; + using list_extents = raft::matrix_extent; + using index_type = IdxT; + + SizeT align_max; + SizeT align_min; + uint32_t dim; + + constexpr list_spec(uint32_t dim, bool conservative_memory_allocation) + : dim(dim), + align_min(kIndexGroupSize), + align_max(conservative_memory_allocation ? kIndexGroupSize : 1024) + { + } + + template + constexpr explicit list_spec(const list_spec& other_spec) + : dim{other_spec.dim}, align_min{other_spec.align_min}, align_max{other_spec.align_max} + { + } + + static constexpr uint32_t kVecLen = 16; + + constexpr auto make_list_extents(SizeT n_rows) const -> list_extents + { + uint32_t padded = ((dim + kVecLen - 1) / kVecLen) * kVecLen; + return raft::make_extents(n_rows, padded); + } +}; + +template +using list_data = ivf::list; + +/** + * @} + */ + +/** + * @defgroup ivf_sq_cpp_index IVF-SQ index + * @{ + */ + +/** + * @brief IVF-SQ index. + * + * In the IVF-SQ index, a database vector is first assigned to the nearest cluster center + * using an inverted file (IVF) structure, and then compressed using scalar quantization (SQ). + * + * Scalar quantization independently maps each dimension of the per-cluster residual (the + * input vector minus its assigned centroid) to a fixed-width integer code. 
For 8-bit + * quantization (uint8_t), each residual component r_i = x_i - centroid_i is linearly + * mapped to an integer in [0, 255] using learned per-dimension minimum (`sq_vmin`) and + * step-size (`sq_delta`) values: + * + * code_i = clamp(round((r_i - vmin_i) / delta_i), 0, 255) + * + * where delta_i is the per-level step size (range divided by 255), so the corresponding + * reconstruction is x_i ≈ centroid_i + vmin_i + code_i * delta_i. + * + * This provides a compact representation (1 byte per dimension) while preserving the relative + * distances between vectors with high fidelity, offering a good trade-off between index size, + * search speed, and recall compared to flat (uncompressed) and product-quantized (PQ) + * representations. + * + * @tparam CodeT SQ code type. Only uint8_t (8-bit, codes in [0,255]) for now. + * + */ +template +struct index : cuvs::neighbors::index { + static_assert(std::is_same_v, "IVF-SQ code type CodeT must be uint8_t for now."); + + using index_params_type = ivf_sq::index_params; + using search_params_type = ivf_sq::search_params; + using code_type = CodeT; + + static constexpr uint32_t sq_bits = sizeof(CodeT) * 8; + + public: + index(const index&) = delete; + index(index&&) = default; + index& operator=(const index&) = delete; + index& operator=(index&&) = default; + ~index() = default; + + index(raft::resources const& res); + index(raft::resources const& res, const index_params& params, uint32_t dim); + index(raft::resources const& res, + cuvs::distance::DistanceType metric, + uint32_t n_lists, + uint32_t dim, + bool conservative_memory_allocation); + + cuvs::distance::DistanceType metric() const noexcept; + int64_t size() const noexcept; + uint32_t dim() const noexcept; + uint32_t n_lists() const noexcept; + bool conservative_memory_allocation() const noexcept; + + raft::device_vector_view list_sizes() noexcept; + raft::device_vector_view list_sizes() const noexcept; + + raft::device_matrix_view centers() noexcept; + 
raft::device_matrix_view centers() const noexcept; + + std::optional> center_norms() noexcept; + std::optional> center_norms() const noexcept; + void allocate_center_norms(raft::resources const& res); + + raft::device_vector_view sq_vmin() noexcept; + raft::device_vector_view sq_vmin() const noexcept; + + raft::device_vector_view sq_delta() noexcept; + raft::device_vector_view sq_delta() const noexcept; + + raft::host_vector_view accum_sorted_sizes() noexcept; + [[nodiscard]] raft::host_vector_view accum_sorted_sizes() const noexcept; + + raft::device_vector_view data_ptrs() noexcept; + raft::device_vector_view data_ptrs() const noexcept; + + raft::device_vector_view inds_ptrs() noexcept; + raft::device_vector_view inds_ptrs() const noexcept; + + std::vector>>& lists() noexcept; + const std::vector>>& lists() const noexcept; + + void check_consistency(); + + private: + cuvs::distance::DistanceType metric_; + bool conservative_memory_allocation_; + + std::vector>> lists_; + raft::device_vector list_sizes_; + raft::device_matrix centers_; + std::optional> center_norms_; + raft::device_vector sq_vmin_; + raft::device_vector sq_delta_; + + raft::device_vector data_ptrs_; + raft::device_vector inds_ptrs_; + raft::host_vector accum_sorted_sizes_; +}; + +/** + * @} + */ + +/** + * @defgroup ivf_sq_cpp_index_build IVF-SQ index build + * @{ + */ + +/** + * @brief Build the index from the dataset for efficient search. 
+ * + * NB: Currently, the following distance metrics are supported: + * - L2Expanded + * - L2SqrtExpanded + * - InnerProduct + * - CosineExpanded + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters + * ivf_sq::index_params index_params; + * // create and fill the index from a [N, D] dataset + * auto index = ivf_sq::build(handle, index_params, dataset); + * @endcode + * + * @param[in] handle + * @param[in] index_params configure the index building + * @param[in] dataset a device pointer to a row-major matrix [n_rows, dim] + * + * @return the constructed ivf-sq index + */ +auto build(raft::resources const& handle, + const cuvs::neighbors::ivf_sq::index_params& index_params, + raft::device_matrix_view dataset) + -> cuvs::neighbors::ivf_sq::index; + +/** + * @brief Build the index from the dataset for efficient search. + * + * NB: Currently, the following distance metrics are supported: + * - L2Expanded + * - L2SqrtExpanded + * - InnerProduct + * - CosineExpanded + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters + * ivf_sq::index_params index_params; + * // create and fill the index from a [N, D] dataset + * ivf_sq::index index; + * ivf_sq::build(handle, index_params, dataset, index); + * @endcode + * + * @param[in] handle + * @param[in] index_params configure the index building + * @param[in] dataset raft::device_matrix_view to a row-major matrix [n_rows, dim] + * @param[out] idx reference to ivf_sq::index + * + */ +void build(raft::resources const& handle, + const cuvs::neighbors::ivf_sq::index_params& index_params, + raft::device_matrix_view dataset, + cuvs::neighbors::ivf_sq::index& idx); + +/** + * @brief Build the index from the dataset for efficient search. 
+ * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters + * ivf_sq::index_params index_params; + * // create and fill the index from a [N, D] dataset + * auto index = ivf_sq::build(handle, index_params, dataset); + * @endcode + * + * @param[in] handle + * @param[in] index_params configure the index building + * @param[in] dataset a device pointer to a row-major matrix [n_rows, dim] + * + * @return the constructed ivf-sq index + */ +auto build(raft::resources const& handle, + const cuvs::neighbors::ivf_sq::index_params& index_params, + raft::device_matrix_view dataset) + -> cuvs::neighbors::ivf_sq::index; + +/** + * @brief Build the index from the dataset for efficient search. + * + * NB: Currently, the following distance metrics are supported: + * - L2Expanded + * - L2SqrtExpanded + * - InnerProduct + * - CosineExpanded + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters + * ivf_sq::index_params index_params; + * // create and fill the index from a [N, D] dataset + * ivf_sq::index index; + * ivf_sq::build(handle, index_params, dataset, index); + * @endcode + * + * @param[in] handle + * @param[in] index_params configure the index building + * @param[in] dataset raft::device_matrix_view to a row-major matrix [n_rows, dim] + * @param[out] idx reference to ivf_sq::index + * + */ +void build(raft::resources const& handle, + const cuvs::neighbors::ivf_sq::index_params& index_params, + raft::device_matrix_view dataset, + cuvs::neighbors::ivf_sq::index& idx); + +/** + * @brief Build the index from the dataset for efficient search. 
+ * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters + * ivf_sq::index_params index_params; + * // create and fill the index from a [N, D] dataset + * auto index = ivf_sq::build(handle, index_params, dataset); + * @endcode + * + * @param[in] handle + * @param[in] index_params configure the index building + * @param[in] dataset a host pointer to a row-major matrix [n_rows, dim] + * + * @return the constructed ivf-sq index + */ +auto build(raft::resources const& handle, + const cuvs::neighbors::ivf_sq::index_params& index_params, + raft::host_matrix_view dataset) + -> cuvs::neighbors::ivf_sq::index; + +/** + * @brief Build the index from the dataset for efficient search. + * + * NB: Currently, the following distance metrics are supported: + * - L2Expanded + * - L2SqrtExpanded + * - InnerProduct + * - CosineExpanded + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters + * ivf_sq::index_params index_params; + * // create and fill the index from a [N, D] dataset + * ivf_sq::index index; + * ivf_sq::build(handle, index_params, dataset, index); + * @endcode + * + * @param[in] handle + * @param[in] index_params configure the index building + * @param[in] dataset raft::host_matrix_view to a row-major matrix [n_rows, dim] + * @param[out] idx reference to ivf_sq::index + * + */ +void build(raft::resources const& handle, + const cuvs::neighbors::ivf_sq::index_params& index_params, + raft::host_matrix_view dataset, + cuvs::neighbors::ivf_sq::index& idx); + +/** + * @brief Build the index from the dataset for efficient search. 
+ * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters + * ivf_sq::index_params index_params; + * // create and fill the index from a [N, D] dataset + * auto index = ivf_sq::build(handle, index_params, dataset); + * @endcode + * + * @param[in] handle + * @param[in] index_params configure the index building + * @param[in] dataset a host pointer to a row-major matrix [n_rows, dim] + * + * @return the constructed ivf-sq index + */ +auto build(raft::resources const& handle, + const cuvs::neighbors::ivf_sq::index_params& index_params, + raft::host_matrix_view dataset) + -> cuvs::neighbors::ivf_sq::index; + +/** + * @brief Build the index from the dataset for efficient search. + * + * NB: Currently, the following distance metrics are supported: + * - L2Expanded + * - L2SqrtExpanded + * - InnerProduct + * - CosineExpanded + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default index parameters + * ivf_sq::index_params index_params; + * // create and fill the index from a [N, D] dataset + * ivf_sq::index index; + * ivf_sq::build(handle, index_params, dataset, index); + * @endcode + * + * @param[in] handle + * @param[in] index_params configure the index building + * @param[in] dataset raft::host_matrix_view to a row-major matrix [n_rows, dim] + * @param[out] idx reference to ivf_sq::index + * + */ +void build(raft::resources const& handle, + const cuvs::neighbors::ivf_sq::index_params& index_params, + raft::host_matrix_view dataset, + cuvs::neighbors::ivf_sq::index& idx); + +/** + * @} + */ + +/** + * @defgroup ivf_sq_cpp_index_extend IVF-SQ index extend + * @{ + */ + +/** + * @brief Extend the index with the new data. 
+ * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * ivf_sq::index_params index_params; + * index_params.add_data_on_build = false; // don't populate index on build + * index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training + * // train the index from a [N, D] dataset + * auto index_empty = ivf_sq::build(handle, index_params, dataset); + * // fill the index with the data + * std::optional> no_op = std::nullopt; + * auto index = ivf_sq::extend(handle, new_vectors, no_op, index_empty); + * @endcode + * + * @param[in] handle + * @param[in] new_vectors a device matrix view to a row-major matrix [n_rows, idx.dim()] + * @param[in] new_indices a device vector view to a vector of indices [n_rows]. + * If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt` + * here to imply a continuous range `[0...n_rows)`. + * @param[in] orig_index the original index + * + * @return the constructed extended ivf-sq index + */ +auto extend(raft::resources const& handle, + raft::device_matrix_view new_vectors, + std::optional> new_indices, + const cuvs::neighbors::ivf_sq::index& orig_index) + -> cuvs::neighbors::ivf_sq::index; + +/** + * @brief Extend the index with the new data. + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * ivf_sq::index_params index_params; + * index_params.add_data_on_build = false; // don't populate index on build + * index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training + * // train the index from a [N, D] dataset + * auto index_empty = ivf_sq::build(handle, index_params, dataset); + * // fill the index with the data + * std::optional> no_op = std::nullopt; + * ivf_sq::extend(handle, new_vectors, no_op, &index_empty); + * @endcode + * + * @param[in] handle + * @param[in] new_vectors a device matrix view to a row-major matrix [n_rows, idx.dim()] + * @param[in] new_indices a device vector view to a vector of indices [n_rows]. 
+ * If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt` + * here to imply a continuous range `[0...n_rows)`. + * @param[inout] idx pointer to ivf_sq::index + */ +void extend(raft::resources const& handle, + raft::device_matrix_view new_vectors, + std::optional> new_indices, + cuvs::neighbors::ivf_sq::index* idx); + +/** + * @brief Extend the index with the new data. + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * ivf_sq::index_params index_params; + * index_params.add_data_on_build = false; // don't populate index on build + * index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training + * // train the index from a [N, D] dataset + * auto index_empty = ivf_sq::build(handle, index_params, dataset); + * // fill the index with the data + * std::optional> no_op = std::nullopt; + * auto index = ivf_sq::extend(handle, new_vectors, no_op, index_empty); + * @endcode + * + * @param[in] handle + * @param[in] new_vectors a device matrix view to a row-major matrix [n_rows, idx.dim()] + * @param[in] new_indices a device vector view to a vector of indices [n_rows]. + * If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt` + * here to imply a continuous range `[0...n_rows)`. + * @param[in] orig_index the original index + * + * @return the constructed extended ivf-sq index + */ +auto extend(raft::resources const& handle, + raft::device_matrix_view new_vectors, + std::optional> new_indices, + const cuvs::neighbors::ivf_sq::index& orig_index) + -> cuvs::neighbors::ivf_sq::index; + +/** + * @brief Extend the index with the new data. 
+ * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * ivf_sq::index_params index_params; + * index_params.add_data_on_build = false; // don't populate index on build + * index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training + * // train the index from a [N, D] dataset + * auto index_empty = ivf_sq::build(handle, index_params, dataset); + * // fill the index with the data + * std::optional> no_op = std::nullopt; + * ivf_sq::extend(handle, new_vectors, no_op, &index_empty); + * @endcode + * + * @param[in] handle + * @param[in] new_vectors a device matrix view to a row-major matrix [n_rows, idx.dim()] + * @param[in] new_indices a device vector view to a vector of indices [n_rows]. + * If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt` + * here to imply a continuous range `[0...n_rows)`. + * @param[inout] idx pointer to ivf_sq::index + */ +void extend(raft::resources const& handle, + raft::device_matrix_view new_vectors, + std::optional> new_indices, + cuvs::neighbors::ivf_sq::index* idx); + +/** + * @brief Extend the index with the new data. + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * ivf_sq::index_params index_params; + * index_params.add_data_on_build = false; // don't populate index on build + * index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training + * // train the index from a [N, D] dataset + * auto index_empty = ivf_sq::build(handle, index_params, dataset); + * // fill the index with the data + * std::optional> no_op = std::nullopt; + * auto index = ivf_sq::extend(handle, new_vectors, no_op, index_empty); + * @endcode + * + * @param[in] handle + * @param[in] new_vectors a host matrix view to a row-major matrix [n_rows, idx.dim()] + * @param[in] new_indices a host vector view to a vector of indices [n_rows]. 
+ * If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt` + * here to imply a continuous range `[0...n_rows)`. + * @param[in] orig_index the original index + * + * @return the constructed extended ivf-sq index + */ +auto extend(raft::resources const& handle, + raft::host_matrix_view new_vectors, + std::optional> new_indices, + const cuvs::neighbors::ivf_sq::index& orig_index) + -> cuvs::neighbors::ivf_sq::index; + +/** + * @brief Extend the index with the new data. + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * ivf_sq::index_params index_params; + * index_params.add_data_on_build = false; // don't populate index on build + * index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training + * // train the index from a [N, D] dataset + * auto index_empty = ivf_sq::build(handle, index_params, dataset); + * // fill the index with the data + * std::optional> no_op = std::nullopt; + * ivf_sq::extend(handle, new_vectors, no_op, &index_empty); + * @endcode + * + * @param[in] handle + * @param[in] new_vectors a host matrix view to a row-major matrix [n_rows, idx.dim()] + * @param[in] new_indices a host vector view to a vector of indices [n_rows]. + * If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt` + * here to imply a continuous range `[0...n_rows)`. + * @param[inout] idx pointer to ivf_sq::index + */ +void extend(raft::resources const& handle, + raft::host_matrix_view new_vectors, + std::optional> new_indices, + cuvs::neighbors::ivf_sq::index* idx); + +/** + * @brief Extend the index with the new data. 
+ * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * ivf_sq::index_params index_params; + * index_params.add_data_on_build = false; // don't populate index on build + * index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training + * // train the index from a [N, D] dataset + * auto index_empty = ivf_sq::build(handle, index_params, dataset); + * // fill the index with the data + * std::optional> no_op = std::nullopt; + * auto index = ivf_sq::extend(handle, new_vectors, no_op, index_empty); + * @endcode + * + * @param[in] handle + * @param[in] new_vectors a host matrix view to a row-major matrix [n_rows, idx.dim()] + * @param[in] new_indices a host vector view to a vector of indices [n_rows]. + * If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt` + * here to imply a continuous range `[0...n_rows)`. + * @param[in] orig_index the original index + * + * @return the constructed extended ivf-sq index + */ +auto extend(raft::resources const& handle, + raft::host_matrix_view new_vectors, + std::optional> new_indices, + const cuvs::neighbors::ivf_sq::index& orig_index) + -> cuvs::neighbors::ivf_sq::index; + +/** + * @brief Extend the index with the new data. + * + * Usage example: + * @code{.cpp} + * using namespace cuvs::neighbors; + * ivf_sq::index_params index_params; + * index_params.add_data_on_build = false; // don't populate index on build + * index_params.kmeans_trainset_fraction = 1.0; // use whole dataset for kmeans training + * // train the index from a [N, D] dataset + * auto index_empty = ivf_sq::build(handle, index_params, dataset); + * // fill the index with the data + * std::optional> no_op = std::nullopt; + * ivf_sq::extend(handle, new_vectors, no_op, &index_empty); + * @endcode + * + * @param[in] handle + * @param[in] new_vectors a host matrix view to a row-major matrix [n_rows, idx.dim()] + * @param[in] new_indices a host vector view to a vector of indices [n_rows]. 
+ * If the original index is empty (`idx.size() == 0`), you can pass `std::nullopt` + * here to imply a continuous range `[0...n_rows)`. + * @param[inout] idx pointer to ivf_sq::index + */ +void extend(raft::resources const& handle, + raft::host_matrix_view new_vectors, + std::optional> new_indices, + cuvs::neighbors::ivf_sq::index* idx); + +/** + * @} + */ + +/** + * @defgroup ivf_sq_cpp_index_search IVF-SQ index search + * @{ + */ + +/** + * @brief Search ANN using the constructed index. + * + * See the [ivf_sq::build](#ivf_sq::build) documentation for a usage example. + * + * Note, this function requires a temporary buffer to store intermediate results between cuda kernel + * calls, which may lead to undesirable allocations and slowdown. To alleviate the problem, you can + * pass a pool memory resource or a large enough pre-allocated memory resource to reduce or + * eliminate entirely allocations happening within `search`. + * The exact size of the temporary buffer depends on multiple factors and is an implementation + * detail. However, you can safely specify a small initial size for the memory pool, so that only a + * few allocations happen to grow it during the first invocations of the `search`. 
+ * + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default search parameters + * ivf_sq::search_params search_params; + * // Use the same allocator across multiple searches to reduce the number of + * // cuda memory allocations + * ivf_sq::search(handle, search_params, index, queries1, out_inds1, out_dists1); + * ivf_sq::search(handle, search_params, index, queries2, out_inds2, out_dists2); + * ivf_sq::search(handle, search_params, index, queries3, out_inds3, out_dists3); + * @endcode + * + * @param[in] handle + * @param[in] params configure the search + * @param[in] index ivf-sq constructed index + * @param[in] queries raft::device_matrix_view to a row-major matrix [n_queries, index->dim()] + * @param[out] neighbors raft::device_matrix_view to the indices of the neighbors in the source + * dataset [n_queries, k] + * @param[out] distances raft::device_matrix_view to the distances to the selected neighbors + * [n_queries, k] + * @param[in] sample_filter an optional device filter function object that greenlights samples + * for a given query. (none_sample_filter for no filtering) + */ +void search(raft::resources const& handle, + const cuvs::neighbors::ivf_sq::search_params& params, + const cuvs::neighbors::ivf_sq::index& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances, + const cuvs::neighbors::filtering::base_filter& sample_filter = + cuvs::neighbors::filtering::none_sample_filter{}); + +/** + * @brief Search ANN using the constructed index with half-precision queries. + * + * See the [ivf_sq::build](#ivf_sq::build) documentation for a usage example. + * + * Note, this function requires a temporary buffer to store intermediate results between cuda kernel + * calls, which may lead to undesirable allocations and slowdown. 
To alleviate the problem, you can + * pass a pool memory resource or a large enough pre-allocated memory resource to reduce or + * eliminate entirely allocations happening within `search`. + * + * @code{.cpp} + * using namespace cuvs::neighbors; + * // use default search parameters + * ivf_sq::search_params search_params; + * // Use the same allocator across multiple searches to reduce the number of + * // cuda memory allocations + * ivf_sq::search(handle, search_params, index, queries1, out_inds1, out_dists1); + * ivf_sq::search(handle, search_params, index, queries2, out_inds2, out_dists2); + * ivf_sq::search(handle, search_params, index, queries3, out_inds3, out_dists3); + * @endcode + * + * @param[in] handle + * @param[in] params configure the search + * @param[in] index ivf-sq constructed index + * @param[in] queries raft::device_matrix_view to a row-major matrix [n_queries, index->dim()] + * @param[out] neighbors raft::device_matrix_view to the indices of the neighbors in the source + * dataset [n_queries, k] + * @param[out] distances raft::device_matrix_view to the distances to the selected neighbors + * [n_queries, k] + * @param[in] sample_filter an optional device filter function object that greenlights samples + * for a given query. (none_sample_filter for no filtering) + */ +void search(raft::resources const& handle, + const cuvs::neighbors::ivf_sq::search_params& params, + const cuvs::neighbors::ivf_sq::index& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances, + const cuvs::neighbors::filtering::base_filter& sample_filter = + cuvs::neighbors::filtering::none_sample_filter{}); + +/** + * @} + */ + +/** + * @defgroup ivf_sq_cpp_index_serialize IVF-SQ index serialize + * @{ + */ + +/** + * Save the index to file. + * + * Experimental, both the API and the serialization format are subject to change. 
+ * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create a string with a filepath + * std::string filename("/path/to/index"); + * // create an index with `auto index = ivf_sq::build(...);` + * cuvs::neighbors::ivf_sq::serialize(handle, filename, index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] filename the file name for saving the index + * @param[in] index IVF-SQ index + * + */ +void serialize(raft::resources const& handle, + const std::string& filename, + const cuvs::neighbors::ivf_sq::index& index); + +/** + * Load index from file. + * + * Experimental, both the API and the serialization format are subject to change. + * + * @code{.cpp} + * #include + * #include + * + * raft::resources handle; + * + * // create a string with a filepath + * std::string filename("/path/to/index"); + * // create an empty index with `ivf_sq::index index(handle);` + * cuvs::neighbors::ivf_sq::deserialize(handle, filename, &index); + * @endcode + * + * @param[in] handle the raft handle + * @param[in] filename the name of the file that stores the index + * @param[out] index IVF-SQ index + * + */ +void deserialize(raft::resources const& handle, + const std::string& filename, + cuvs::neighbors::ivf_sq::index* index); + +/** + * @} + */ + +} // namespace cuvs::neighbors::ivf_sq diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/accumulate_distance_impl.cuh b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/accumulate_distance_impl.cuh new file mode 100644 index 0000000000..5368e101cd --- /dev/null +++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/accumulate_distance_impl.cuh @@ -0,0 +1,46 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include + +namespace cuvs::neighbors::ivf_sq::detail { + +// Per-element distance accumulator. Called inside the unrolled inner loop of +// ivf_sq_scan_impl. 
After JIT-LTO inlining, these bodies fold directly into +// the unrolled loop and the v_norm_sq plumbing is dead-code-eliminated for +// non-cosine metrics. +// +// L2: diff = qt - code*scale; dist += diff*diff +// IP: v = aux + code*scale; dist += qt*v +// Cosine: as IP, plus v_norm_sq += v*v + +__device__ void accumulate_distance_l2_impl( + float qt, float /* aux */, float scale, uint8_t code, float& dist, float& /* v_norm_sq */) +{ + float recon = float(code) * scale; + float diff = qt - recon; + dist += diff * diff; +} + +__device__ void accumulate_distance_ip_impl( + float qt, float aux, float scale, uint8_t code, float& dist, float& /* v_norm_sq */) +{ + float recon = float(code) * scale; + float v_d = aux + recon; + dist += qt * v_d; +} + +__device__ void accumulate_distance_cosine_impl( + float qt, float aux, float scale, uint8_t code, float& dist, float& v_norm_sq) +{ + float recon = float(code) * scale; + float v_d = aux + recon; + dist += qt * v_d; + v_norm_sq += v_d * v_d; +} + +} // namespace cuvs::neighbors::ivf_sq::detail diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/accumulate_distance_kernel.cu.in b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/accumulate_distance_kernel.cu.in new file mode 100644 index 0000000000..7d7e77c85a --- /dev/null +++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/accumulate_distance_kernel.cu.in @@ -0,0 +1,24 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "accumulate_distance_impl.cuh" +#include "device_functions.cuh" + +namespace { + +constexpr auto accumulate_distance_impl = + cuvs::neighbors::ivf_sq::detail::accumulate_distance_@metric_name@_impl; + +} // namespace + +namespace cuvs::neighbors::ivf_sq::detail { + +__device__ void accumulate_distance( + float qt, float aux, float scale, uint8_t code, float& dist, float& v_norm_sq) +{ + accumulate_distance_impl(qt, aux, scale, code, dist, v_norm_sq); +} + +} // namespace cuvs::neighbors::ivf_sq::detail diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/accumulate_distance_matrix.json b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/accumulate_distance_matrix.json new file mode 100644 index 0000000000..915ea1284a --- /dev/null +++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/accumulate_distance_matrix.json +{ + "metric_name": ["l2", "ip", "cosine"] +} diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/device_functions.cuh b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/device_functions.cuh new file mode 100644 index 0000000000..ca22c21168 --- /dev/null +++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/device_functions.cuh +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include <cstdint> + +namespace cuvs::neighbors::ivf_sq::detail { + +// Forward declaration of the sample filter device function. +// The concrete implementation is provided by a JIT-LTO filter-adapter fragment +// (see filter_kernel.cu.in) that delegates to the shared +// cuvs::neighbors::detail::sample_filter_ fragment. +template <typename IndexT> +__device__ bool sample_filter(const IndexT* const* const inds_ptrs, + const uint32_t query_ix, + const uint32_t cluster_ix, + const uint32_t sample_ix, + uint32_t* bitset_ptr, + IndexT bitset_len, + IndexT original_nbits); + +// Forward declarations of the metric-specific scan device functions.
The +// concrete implementations are provided by JIT-LTO fragments generated from +// setup_invariant_smem_kernel.cu.in, setup_per_probe_smem_kernel.cu.in, +// accumulate_distance_kernel.cu.in and finalize_distance_kernel.cu.in. After +// nvJitLink LTO inlines these into ivf_sq_scan_impl, the codegen matches the +// pre-refactor `if constexpr (kIsL2 / kIsCosine)` form. + +// Phase 1: load the metric-invariant smem array. Called once per query. +__device__ void setup_invariant_smem(uint32_t dim, + const float* __restrict__ query, + const float* __restrict__ sq_vmin, + float* __restrict__ s_aux, + float* __restrict__ s_query_term); + +// Phase 2: load the per-probe smem array. Called once per (query, probe). +__device__ void setup_per_probe_smem(uint32_t dim, + const float* __restrict__ centroid, + const float* __restrict__ sq_vmin, + float* __restrict__ s_aux, + float* __restrict__ s_query_term); + +// Per-element distance accumulator. Called inside the unrolled inner loop. +__device__ void accumulate_distance( + float qt, float aux, float scale, uint8_t code, float& dist, float& v_norm_sq); + +// Per-row distance finalize. Called once per scanned row. +__device__ float finalize_distance(float dist, float v_norm_sq, float query_norm); + +} // namespace cuvs::neighbors::ivf_sq::detail diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/filter_kernel.cu.in b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/filter_kernel.cu.in new file mode 100644 index 0000000000..c93038de84 --- /dev/null +++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/filter_kernel.cu.in @@ -0,0 +1,31 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include + +namespace { + +constexpr auto sample_filter_impl = + cuvs::neighbors::detail::sample_filter_@filter_name@; + +} // namespace + +namespace cuvs::neighbors::ivf_sq::detail { + +template <> +__device__ bool sample_filter(const int64_t* const* const inds_ptrs, + const uint32_t query_ix, + const uint32_t cluster_ix, + const uint32_t sample_ix, + uint32_t* bitset_ptr, + int64_t bitset_len, + int64_t original_nbits) +{ + return sample_filter_impl( + inds_ptrs, query_ix, cluster_ix, sample_ix, bitset_ptr, bitset_len, original_nbits); +} + +} // namespace cuvs::neighbors::ivf_sq::detail diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/filter_matrix.json b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/filter_matrix.json new file mode 100644 index 0000000000..2a01bc4583 --- /dev/null +++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/filter_matrix.json @@ -0,0 +1,3 @@ +{ + "filter_name": ["none", "bitset"] +} diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/finalize_distance_impl.cuh b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/finalize_distance_impl.cuh new file mode 100644 index 0000000000..4276f80dd7 --- /dev/null +++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/finalize_distance_impl.cuh @@ -0,0 +1,45 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include + +#include + +namespace cuvs::neighbors::ivf_sq::detail { + +// Per-row distance finalize. Called once per scanned row. +// +// All metrics return a min-close value so the warpsort epilogue and the +// host-side select_k can be hardcoded to ascending / select-min: +// L2: return dist (squared L2 is min-close) +// IP: return -dist (negate so min-close; +// postprocess_distances +// undoes the sign) +// Cosine: denom = query_norm * sqrtf(v_norm_sq); +// return denom > 0 ? 
1 - dist/denom : 0 (cosine distance is min-close) + +__device__ float finalize_distance_l2_impl(float dist, + float /* v_norm_sq */, + float /* query_norm */) +{ + return dist; +} + +__device__ float finalize_distance_ip_impl(float dist, + float /* v_norm_sq */, + float /* query_norm */) +{ + return -dist; +} + +__device__ float finalize_distance_cosine_impl(float dist, float v_norm_sq, float query_norm) +{ + float denom = query_norm * sqrtf(v_norm_sq); + return (denom > 0.0f) ? 1.0f - dist / denom : 0.0f; +} + +} // namespace cuvs::neighbors::ivf_sq::detail diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/finalize_distance_kernel.cu.in b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/finalize_distance_kernel.cu.in new file mode 100644 index 0000000000..642e4f1d5d --- /dev/null +++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/finalize_distance_kernel.cu.in @@ -0,0 +1,23 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include + +namespace { + +constexpr auto finalize_distance_impl = + cuvs::neighbors::ivf_sq::detail::finalize_distance_@metric_name@_impl; + +} // namespace + +namespace cuvs::neighbors::ivf_sq::detail { + +__device__ float finalize_distance(float dist, float v_norm_sq, float query_norm) +{ + return finalize_distance_impl(dist, v_norm_sq, query_norm); +} + +} // namespace cuvs::neighbors::ivf_sq::detail diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/finalize_distance_matrix.json b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/finalize_distance_matrix.json new file mode 100644 index 0000000000..915ea1284a --- /dev/null +++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/finalize_distance_matrix.json @@ -0,0 +1,3 @@ +{ + "metric_name": ["l2", "ip", "cosine"] +} diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/kernel_def.hpp b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/kernel_def.hpp new file mode 100644 index 
0000000000..5df97deabc --- /dev/null +++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/kernel_def.hpp @@ -0,0 +1,38 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include + +namespace cuvs::neighbors::ivf_sq::detail { + +static constexpr int kSqScanThreads = 128; + +// Function-pointer signature for the JIT-LTO scan entrypoint. +// Must exactly match the extern "C" __global__ ivf_sq_scan(...) signature +// produced by scan_kernel.cu.in. +template +using ivf_sq_scan_func_t = void(const uint8_t* const* data_ptrs, + const uint32_t* list_sizes, + const uint32_t* coarse_indices, + const float* queries_float, + const float* centers, + const float* sq_vmin, + const float* sq_delta, + const float* query_norms, + uint32_t n_probes, + uint32_t dim, + uint32_t k, + uint32_t max_samples, + const uint32_t* chunk_indices, + float* out_distances, + uint32_t* out_indices, + IdxT* const* inds_ptrs, + uint32_t* bitset_ptr, + IdxT bitset_len, + IdxT original_nbits); + +} // namespace cuvs::neighbors::ivf_sq::detail diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_impl.cuh b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_impl.cuh new file mode 100644 index 0000000000..85cb9605be --- /dev/null +++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_impl.cuh @@ -0,0 +1,260 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include "device_functions.cuh" +#include "kernel_def.hpp" +#include +#include +#include + +#include +#include +#include + +#include +#include + +namespace cuvs::neighbors::ivf_sq::detail { + +// block_sort type selection: dispatch the dummy block sort when Capacity == 0 +// so the same impl body works for both the fused top-k path (Capacity > 0) and +// the materialize-all path (Capacity == 0). 
+// +// All metrics are min-close after finalize_distance (IP is negated; cosine +// returns 1 - cos_sim; L2 is squared L2), so the warpsort is hardcoded to +// ascending. +template +struct sq_block_sort { + using type = raft::matrix::detail::select::warpsort::block_sort< + raft::matrix::detail::select::warpsort::warp_sort_filtered, + Capacity, + /*Ascending=*/true, + float, + uint32_t>; +}; + +template <> +struct sq_block_sort<0> { + using type = ivf::detail::dummy_block_sort_t; +}; + +template +using sq_block_sort_t = typename sq_block_sort::type; + +// IVF-SQ scan kernel body with fused in-kernel top-k. +// +// The kernel is metric-agnostic: the four metric-specific pieces are +// linked in at runtime via JIT-LTO as fragments declared in +// device_functions.cuh: +// - setup_invariant_smem (Phase 1, once per query) +// - setup_per_probe_smem (Phase 2, once per probe) +// - accumulate_distance (per-element, inner unrolled loop) +// - finalize_distance (per-row, after the dim accumulation; ensures +// all metrics are min-close so the warpsort +// epilogue and the host select_k can be +// hardcoded to ascending / select-min) +// +// The host launcher (ivf_sq_search.cuh) registers the matching metric +// variant of each fragment with the planner. IP scores are negated by +// finalize_distance and undone by postprocess_distances. +// +// Grid layout: +// kManageLocalTopK (Capacity > 0): +// grid (grid_dim_x, n_queries) - each block loops over probes +// otherwise (Capacity == 0): +// grid (n_probes, n_queries) - one block per (query, probe) +// +// Shared-memory layout (always 3 x dim floats): +// [s_sq_scale(dim) | s_query_term(dim) | s_aux(dim)] +// +// s_sq_scale = delta[d] - SQ dequantization scale, invariant (Phase 1). 
+// +// L2 path: +// Phase 1: s_aux[d] = query[d] - vmin[d] (invariant) +// Phase 2: s_query_term[d] = s_aux[d] - centroid[d] (per-probe) +// The full SQ reconstruction is centroid + vmin + code*delta, so +// query - reconstructed = (query - vmin - centroid) - code*delta +// = s_query_term - code*s_sq_scale. +// +// IP/Cosine path: +// Phase 1: s_query_term[d] = query[d] (invariant) +// Phase 2: s_aux[d] = centroid[d] + vmin[d] (per-probe) +// Reconstructed vector component: s_aux[d] + code*s_sq_scale[d]. +// +// After all probes are scanned, the smem is reused for block_sort merge. +template +__device__ __forceinline__ void ivf_sq_scan_impl(const uint8_t* const* data_ptrs, + const uint32_t* list_sizes, + const uint32_t* coarse_indices, + const float* queries_float, + const float* centers, + const float* sq_vmin, + const float* sq_delta, + const float* query_norms, + uint32_t n_probes, + uint32_t dim, + uint32_t k, + uint32_t max_samples, + const uint32_t* chunk_indices, + float* out_distances, + uint32_t* out_indices, + const int64_t* const* inds_ptrs, + uint32_t* bitset_ptr, + int64_t bitset_len, + int64_t original_nbits) +{ + static_assert(kIndexGroupSize == raft::WarpSize, + "Warp-coalesced scan requires kIndexGroupSize == WarpSize"); + + constexpr int BlockDim = kSqScanThreads; + constexpr bool kManageLocalTopK = (Capacity > 0); + + extern __shared__ __align__(256) uint8_t smem_buf[]; + float* smem = reinterpret_cast(smem_buf); + + float* s_sq_scale = smem; + float* s_query_term = smem + dim; + float* s_aux = smem + 2 * dim; + + const uint32_t query_ix = blockIdx.y; + const float* query = queries_float + query_ix * dim; + + // Hoist the per-query scalar load to a uniform value. The cosine fragment of + // finalize_distance is the only consumer; for L2/IP, the unused argument is + // dead-code-eliminated after JIT-LTO inlining and this load disappears. + const float q_norm = (query_norms != nullptr) ? 
query_norms[query_ix] : 0.0f; + + if constexpr (kManageLocalTopK) { + out_distances += uint64_t(query_ix) * k * gridDim.x + blockIdx.x * k; + out_indices += uint64_t(query_ix) * k * gridDim.x + blockIdx.x * k; + } + + // Phase 1: load shared memory that is invariant across probes. + for (uint32_t d = threadIdx.x; d < dim; d += BlockDim) { + s_sq_scale[d] = sq_delta[d]; + } + setup_invariant_smem(dim, query, sq_vmin, s_aux, s_query_term); + __syncthreads(); + + using local_topk_t = sq_block_sort_t; + local_topk_t queue(k); + + const uint32_t* my_coarse = coarse_indices + query_ix * n_probes; + const uint32_t* my_chunk = chunk_indices + query_ix * n_probes; + + constexpr uint32_t veclen = 16; + constexpr uint32_t kWarpsPerBlock = BlockDim / raft::WarpSize; + const uint32_t warp_id = threadIdx.x / raft::WarpSize; + const uint32_t lane_id = threadIdx.x % raft::WarpSize; + + // Phase 2: loop over probes. + // Synchronization protocol: + // (a) __syncthreads after Phase 1 (above) ensures invariant smem arrays + // (s_sq_scale, and L2: s_aux / IP-Cosine: s_query_term) are visible + // before Phase 2 overwrites the per-probe array. + // (b) __syncthreads after per-probe smem writes (L2: s_query_term / + // IP-Cosine: s_aux) ensures probe-specific values are visible before + // the distance computation. + // (c) __syncthreads at the end of each iteration ensures all distance + // computation reads are complete before the next iteration overwrites + // the per-probe smem region. + // When cluster_sz == 0, barrier (c) is skipped because no distance reads + // occurred; all threads converge on the same branch uniformly, and the + // next iteration's barrier (b) provides the needed ordering. + for (uint32_t probe_ix = blockIdx.x; probe_ix < n_probes; + probe_ix += (kManageLocalTopK ? 
gridDim.x : uint32_t{1})) { + const uint32_t cluster_id = my_coarse[probe_ix]; + const uint32_t cluster_sz = list_sizes[cluster_id]; + + setup_per_probe_smem(dim, centers + cluster_id * dim, sq_vmin, s_aux, s_query_term); + __syncthreads(); // (b) + + if (cluster_sz == 0) { + if constexpr (!kManageLocalTopK) break; + continue; + } + + const uint8_t* codes = data_ptrs[cluster_id]; + uint32_t sample_offset = (probe_ix > 0) ? my_chunk[probe_ix - 1] : 0; + uint32_t padded_dim = ((dim + veclen - 1) / veclen) * veclen; + uint32_t n_dim_blocks = padded_dim / veclen; + + for (uint32_t group = warp_id * kIndexGroupSize; group < cluster_sz; + group += kWarpsPerBlock * kIndexGroupSize) { + const uint32_t row = group + lane_id; + const bool valid = + (row < cluster_sz) && + sample_filter( + inds_ptrs, query_ix, cluster_id, row, bitset_ptr, bitset_len, original_nbits); + + float dist = 0.0f; + float v_norm_sq = 0.0f; + + const uint8_t* group_data = codes + size_t(group) * padded_dim; + + for (uint32_t bl = 0; bl < n_dim_blocks; bl++) { + uint8_t codes_local[veclen]; + *reinterpret_cast(codes_local) = *reinterpret_cast( + group_data + bl * (veclen * kIndexGroupSize) + lane_id * veclen); + + const uint32_t l = bl * veclen; +#pragma unroll + for (uint32_t j = 0; j < veclen; j++) { + if (l + j < dim) { + accumulate_distance(s_query_term[l + j], + s_aux[l + j], + s_sq_scale[l + j], + codes_local[j], + dist, + v_norm_sq); + } + } + } + + dist = finalize_distance(dist, v_norm_sq, q_norm); + + if constexpr (kManageLocalTopK) { + float val = valid ? 
dist : local_topk_t::queue_t::kDummy; + queue.add(val, sample_offset + row); + } else { + if (valid) { + uint32_t out_idx = query_ix * max_samples + sample_offset + row; + out_distances[out_idx] = dist; + out_indices[out_idx] = sample_offset + row; + } + } + } + + __syncthreads(); // (c) + if constexpr (!kManageLocalTopK) break; + } + + if constexpr (kManageLocalTopK) { + // All probe iterations are done; smem_buf is reused for block_sort merge. + // The loop's last (b) or (c) barrier ensures all prior smem accesses have + // completed, so this additional barrier is only needed to synchronize any + // register-level state across warps before the merge. + __syncthreads(); + queue.done(smem_buf); + queue.store(out_distances, out_indices); + + // block_sort initializes unused slots with (kDummy, idx=0). When the + // probed clusters have fewer than k total valid vectors, those slots + // survive into the output and share idx=0 with the real first vector, + // causing duplicates. Mark them with an invalid index so + // postprocess_neighbors treats them as out-of-bounds. + // store() is a warp-0-only operation, restrict the fixup to the same warp. + if (threadIdx.x < raft::WarpSize) { + constexpr auto kDummyVal = local_topk_t::queue_t::kDummy; + for (uint32_t i = threadIdx.x; i < k; i += raft::WarpSize) { + if (out_distances[i] == kDummyVal) { out_indices[i] = uint32_t(0xFFFFFFFF); } + } + } + } +} + +} // namespace cuvs::neighbors::ivf_sq::detail diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_kernel.cu.in b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_kernel.cu.in new file mode 100644 index 0000000000..d48c4d259f --- /dev/null +++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_kernel.cu.in @@ -0,0 +1,61 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include + +namespace { + +constexpr int capacity = @capacity@; + +} // namespace + +namespace cuvs::neighbors::ivf_sq::detail { + +extern "C" __global__ __launch_bounds__(kSqScanThreads) void ivf_sq_scan( + const uint8_t* const* data_ptrs, + const uint32_t* list_sizes, + const uint32_t* coarse_indices, + const float* queries_float, + const float* centers, + const float* sq_vmin, + const float* sq_delta, + const float* query_norms, + uint32_t n_probes, + uint32_t dim, + uint32_t k, + uint32_t max_samples, + const uint32_t* chunk_indices, + float* out_distances, + uint32_t* out_indices, + int64_t* const* inds_ptrs, + uint32_t* bitset_ptr, + int64_t bitset_len, + int64_t original_nbits) +{ + ivf_sq_scan_impl(data_ptrs, + list_sizes, + coarse_indices, + queries_float, + centers, + sq_vmin, + sq_delta, + query_norms, + n_probes, + dim, + k, + max_samples, + chunk_indices, + out_distances, + out_indices, + inds_ptrs, + bitset_ptr, + bitset_len, + original_nbits); +} + +static_assert(std::is_same_v>); + +} // namespace cuvs::neighbors::ivf_sq::detail diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_matrix.json b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_matrix.json new file mode 100644 index 0000000000..4af912b122 --- /dev/null +++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_matrix.json @@ -0,0 +1,5 @@ +{ + "capacity": [ + "0", "32", "64", "128", "256" + ] +} diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_planner.hpp b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_planner.hpp new file mode 100644 index 0000000000..05ea34532e --- /dev/null +++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/scan_planner.hpp @@ -0,0 +1,62 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include +#include + +#include + +namespace cuvs::neighbors::ivf_sq::detail { + +struct IvfSqScanPlanner : AlgorithmPlanner { + inline static LauncherJitCache launcher_jit_cache{}; + + IvfSqScanPlanner() : AlgorithmPlanner("ivf_sq_scan", launcher_jit_cache) {} + + template + void add_entrypoint() + { + this->add_static_fragment>(); + } + + template + void add_filter_device_function() + { + this->add_static_fragment>(); + this->add_static_fragment< + cuvs::neighbors::detail::fragment_tag_sample_filter>(); + } + + template + void add_setup_invariant_smem_function() + { + this->add_static_fragment>(); + } + + template + void add_setup_per_probe_smem_function() + { + this->add_static_fragment>(); + } + + template + void add_accumulate_distance_function() + { + this->add_static_fragment>(); + } + + template + void add_finalize_distance_function() + { + this->add_static_fragment>(); + } +}; + +} // namespace cuvs::neighbors::ivf_sq::detail diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_invariant_smem_impl.cuh b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_invariant_smem_impl.cuh new file mode 100644 index 0000000000..b0e2d5ea9e --- /dev/null +++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_invariant_smem_impl.cuh @@ -0,0 +1,50 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include + +namespace cuvs::neighbors::ivf_sq::detail { + +// Phase 1 invariant smem setup. Called once per query. +// +// Loads the metric-invariant smem array. The metric-variant array is left +// untouched and will be filled per-probe by setup_per_probe_smem. 
+ +__device__ void setup_invariant_smem_l2_impl(uint32_t dim, + const float* __restrict__ query, + const float* __restrict__ sq_vmin, + float* __restrict__ s_aux, + float* __restrict__ /* s_query_term */) +{ + for (uint32_t d = threadIdx.x; d < dim; d += blockDim.x) { + s_aux[d] = query[d] - sq_vmin[d]; + } +} + +__device__ void setup_invariant_smem_ip_impl(uint32_t dim, + const float* __restrict__ query, + const float* __restrict__ /* sq_vmin */, + float* __restrict__ /* s_aux */, + float* __restrict__ s_query_term) +{ + for (uint32_t d = threadIdx.x; d < dim; d += blockDim.x) { + s_query_term[d] = query[d]; + } +} + +__device__ void setup_invariant_smem_cosine_impl(uint32_t dim, + const float* __restrict__ query, + const float* __restrict__ /* sq_vmin */, + float* __restrict__ /* s_aux */, + float* __restrict__ s_query_term) +{ + for (uint32_t d = threadIdx.x; d < dim; d += blockDim.x) { + s_query_term[d] = query[d]; + } +} + +} // namespace cuvs::neighbors::ivf_sq::detail diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_invariant_smem_kernel.cu.in b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_invariant_smem_kernel.cu.in new file mode 100644 index 0000000000..43a31f7fc4 --- /dev/null +++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_invariant_smem_kernel.cu.in @@ -0,0 +1,27 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include + +namespace { + +constexpr auto setup_invariant_smem_impl = + cuvs::neighbors::ivf_sq::detail::setup_invariant_smem_@metric_name@_impl; + +} // namespace + +namespace cuvs::neighbors::ivf_sq::detail { + +__device__ void setup_invariant_smem(uint32_t dim, + const float* __restrict__ query, + const float* __restrict__ sq_vmin, + float* __restrict__ s_aux, + float* __restrict__ s_query_term) +{ + setup_invariant_smem_impl(dim, query, sq_vmin, s_aux, s_query_term); +} + +} // namespace cuvs::neighbors::ivf_sq::detail diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_invariant_smem_matrix.json b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_invariant_smem_matrix.json new file mode 100644 index 0000000000..915ea1284a --- /dev/null +++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_invariant_smem_matrix.json @@ -0,0 +1,3 @@ +{ + "metric_name": ["l2", "ip", "cosine"] +} diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_per_probe_smem_impl.cuh b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_per_probe_smem_impl.cuh new file mode 100644 index 0000000000..6e0b19a1f9 --- /dev/null +++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_per_probe_smem_impl.cuh @@ -0,0 +1,55 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include + +namespace cuvs::neighbors::ivf_sq::detail { + +// Phase 2 per-probe smem setup. Called once per (query, probe). +// +// Loads the per-probe smem array using the invariant array filled in Phase 1. +// +// L2: s_query_term[d] = s_aux[d] - centroid[d] (where s_aux holds query - sq_vmin) +// IP/Cosine: s_aux[d] = centroid[d] + sq_vmin[d] +// +// The "in" and "out" smem arrays do not alias — they are distinct regions of +// the kernel's smem layout — so __restrict__ is safe. 
+ +__device__ void setup_per_probe_smem_l2_impl(uint32_t dim, + const float* __restrict__ centroid, + const float* __restrict__ /* sq_vmin */, + float* __restrict__ s_aux, + float* __restrict__ s_query_term) +{ + for (uint32_t d = threadIdx.x; d < dim; d += blockDim.x) { + s_query_term[d] = s_aux[d] - centroid[d]; + } +} + +__device__ void setup_per_probe_smem_ip_impl(uint32_t dim, + const float* __restrict__ centroid, + const float* __restrict__ sq_vmin, + float* __restrict__ s_aux, + float* __restrict__ /* s_query_term */) +{ + for (uint32_t d = threadIdx.x; d < dim; d += blockDim.x) { + s_aux[d] = centroid[d] + sq_vmin[d]; + } +} + +__device__ void setup_per_probe_smem_cosine_impl(uint32_t dim, + const float* __restrict__ centroid, + const float* __restrict__ sq_vmin, + float* __restrict__ s_aux, + float* __restrict__ /* s_query_term */) +{ + for (uint32_t d = threadIdx.x; d < dim; d += blockDim.x) { + s_aux[d] = centroid[d] + sq_vmin[d]; + } +} + +} // namespace cuvs::neighbors::ivf_sq::detail diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_per_probe_smem_kernel.cu.in b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_per_probe_smem_kernel.cu.in new file mode 100644 index 0000000000..81d206f78d --- /dev/null +++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_per_probe_smem_kernel.cu.in @@ -0,0 +1,27 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include +#include + +namespace { + +constexpr auto setup_per_probe_smem_impl = + cuvs::neighbors::ivf_sq::detail::setup_per_probe_smem_@metric_name@_impl; + +} // namespace + +namespace cuvs::neighbors::ivf_sq::detail { + +__device__ void setup_per_probe_smem(uint32_t dim, + const float* __restrict__ centroid, + const float* __restrict__ sq_vmin, + float* __restrict__ s_aux, + float* __restrict__ s_query_term) +{ + setup_per_probe_smem_impl(dim, centroid, sq_vmin, s_aux, s_query_term); +} + +} // namespace cuvs::neighbors::ivf_sq::detail diff --git a/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_per_probe_smem_matrix.json b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_per_probe_smem_matrix.json new file mode 100644 index 0000000000..915ea1284a --- /dev/null +++ b/cpp/src/neighbors/ivf_sq/detail/jit_lto_kernels/setup_per_probe_smem_matrix.json @@ -0,0 +1,3 @@ +{ + "metric_name": ["l2", "ip", "cosine"] +} diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh new file mode 100644 index 0000000000..52be653985 --- /dev/null +++ b/cpp/src/neighbors/ivf_sq/ivf_sq_build.cuh @@ -0,0 +1,675 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include "../../core/nvtx.hpp" +#include "../ivf_common.cuh" +#include "../ivf_list.cuh" + +#include +#include +#include + +#include "../detail/ann_utils.cuh" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +namespace cuvs::neighbors::ivf_sq { +using namespace cuvs::spatial::knn::detail; // NOLINT + +namespace detail { + +struct ColMinMaxPair { + float min_val; + float max_val; +}; + +struct ColMinMaxOp { + __device__ __forceinline__ ColMinMaxPair operator()(const ColMinMaxPair& a, + const ColMinMaxPair& b) const + { + return {fminf(a.min_val, b.min_val), fmaxf(a.max_val, b.max_val)}; + } +}; + +/** + * Vectorized load helper: reads VecCols contiguous elements of type T as + * a single aligned wide load (via raft::TxN_t -> __ldg on the promoted + * io_t) and unpacks them into floats. + * + * The primary benefit over scalar loads is halving (VecCols=2) or + * quartering (VecCols=4) the number of LDG instructions issued per warp, + * which is the dominant cost in the column-strided access pattern of + * fused_column_minmax_kernel. VecCols=1 is the degenerate scalar + * fallback for odd `dim`. + * + * Requires `p` to be aligned to sizeof(raft::IOType::Type), + * i.e. sizeof(T) * VecCols. + */ +template +__device__ __forceinline__ void load_cols_as_float(const T* p, float (&out)[VecCols]) +{ + raft::TxN_t v; + v.load(p, 0); +#pragma unroll + for (int k = 0; k < VecCols; ++k) { + out[k] = static_cast(v.val.data[k]); + } +} + +/** + * Fused per-column min+max in a single pass (2x less DRAM traffic than two + * separate reductions). Each block owns VecCols contiguous columns and + * threads read them with a single aligned wide load (float4/float2 for + * float, uint2/uint32 for half) per row instead of VecCols scalar loads. 
+ * Requires dim to be a multiple of VecCols so that + * `data + row*dim + col_base` is `sizeof(T)*VecCols` aligned for every + * row; the host-side dispatcher picks the widest VecCols that satisfies + * this, down to VecCols=1 (pure scalar fallback) for arbitrary dim. + * + * Row-loop is manually 4x-unrolled so the compiler can overlap four + * independent read-only loads in the memory pipeline. + */ +template +__launch_bounds__(BlockSize) RAFT_KERNEL fused_column_minmax_kernel(const T* __restrict__ data, + float* __restrict__ col_min, + float* __restrict__ col_max, + int64_t n_rows, + uint32_t dim) +{ + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + const uint32_t col_base = blockIdx.x * VecCols; + // When launched with gridDim.x = dim / VecCols and dim % VecCols == 0 + // (enforced by the host-side dispatch), col_base + VecCols <= dim always. + + ColMinMaxPair agg[VecCols]; +#pragma unroll + for (int k = 0; k < VecCols; ++k) { + agg[k] = {std::numeric_limits::max(), std::numeric_limits::lowest()}; + } + + const int64_t stride = static_cast(BlockSize); + int64_t row = static_cast(threadIdx.x); + + // 4x row-unrolled loop with vectorized loads: 4 * VecCols values per iter + // are pulled into registers via 4 wide LDGs, exposing ILP across both + // the column and row axes. 
+ for (; row + 3 * stride < n_rows; row += 4 * stride) { + float r0[VecCols], r1[VecCols], r2[VecCols], r3[VecCols]; + load_cols_as_float(data + row * dim + col_base, r0); + load_cols_as_float(data + (row + stride) * dim + col_base, r1); + load_cols_as_float(data + (row + 2 * stride) * dim + col_base, r2); + load_cols_as_float(data + (row + 3 * stride) * dim + col_base, r3); +#pragma unroll + for (int k = 0; k < VecCols; ++k) { + float mn = fminf(fminf(r0[k], r1[k]), fminf(r2[k], r3[k])); + float mx = fmaxf(fmaxf(r0[k], r1[k]), fmaxf(r2[k], r3[k])); + agg[k].min_val = fminf(agg[k].min_val, mn); + agg[k].max_val = fmaxf(agg[k].max_val, mx); + } + } + for (; row < n_rows; row += stride) { + float r[VecCols]; + load_cols_as_float(data + row * dim + col_base, r); +#pragma unroll + for (int k = 0; k < VecCols; ++k) { + agg[k].min_val = fminf(agg[k].min_val, r[k]); + agg[k].max_val = fmaxf(agg[k].max_val, r[k]); + } + } + + // One block-reduce per owned column. CUB requires a __syncthreads() + // between reuses of the shared temp_storage. + for (int k = 0; k < VecCols; ++k) { + if (k > 0) __syncthreads(); + auto r = BlockReduce(temp_storage).Reduce(agg[k], ColMinMaxOp()); + if (threadIdx.x == 0) { + col_min[col_base + k] = r.min_val; + col_max[col_base + k] = r.max_val; + } + } +} + +/** + * Host-side dispatch that selects the widest VecCols compatible with the + * given dim alignment (VecCols in {4, 2, 1}), and launches + * fused_column_minmax_kernel with the corresponding grid shape. 
+ */ +template +inline void launch_fused_column_minmax( + const T* data, float* col_min, float* col_max, int64_t n_rows, uint32_t dim, cudaStream_t stream) +{ + if (dim % 4 == 0) { + fused_column_minmax_kernel + <<>>(data, col_min, col_max, n_rows, dim); + } else if (dim % 2 == 0) { + fused_column_minmax_kernel + <<>>(data, col_min, col_max, n_rows, dim); + } else { + fused_column_minmax_kernel + <<>>(data, col_min, col_max, n_rows, dim); + } +} + +template +auto clone(const raft::resources& res, const index& source) -> index +{ + auto stream = raft::resource::get_cuda_stream(res); + + index target( + res, source.metric(), source.n_lists(), source.dim(), source.conservative_memory_allocation()); + + raft::copy(target.list_sizes().data_handle(), + source.list_sizes().data_handle(), + source.list_sizes().size(), + stream); + raft::copy(target.centers().data_handle(), + source.centers().data_handle(), + source.centers().size(), + stream); + if (source.center_norms().has_value()) { + target.allocate_center_norms(res); + raft::copy(target.center_norms()->data_handle(), + source.center_norms()->data_handle(), + source.center_norms()->size(), + stream); + } + raft::copy(target.sq_vmin().data_handle(), + source.sq_vmin().data_handle(), + source.sq_vmin().size(), + stream); + raft::copy(target.sq_delta().data_handle(), + source.sq_delta().data_handle(), + source.sq_delta().size(), + stream); + target.lists() = source.lists(); + ivf::detail::recompute_internal_state(res, target); + return target; +} + +/** + * Kernel to compute residuals on the fly and encode them to uint8_t SQ codes + * interleaved into the target list. + * + * Uses warp-per-vector parallelism: each warp cooperatively encodes one vector + * so that reads from new_vectors/centers/vmin/delta are coalesced across the + * 32 lanes. Lane 0 handles the atomic position assignment and the index write. 
+ * + * The residual (cast(new_vectors[i,d]) - centers[list_id,d]) is computed + * in registers and consumed immediately, avoiding a full HBM round trip through + * an intermediate residuals buffer. + */ +template +__launch_bounds__(BlockSize) RAFT_KERNEL encode_and_fill_kernel(const uint32_t* labels, + const T* new_vectors, + const float* centers, + const int64_t* source_ixs, + uint8_t** list_data_ptrs, + int64_t** list_index_ptrs, + uint32_t* list_sizes_ptr, + const float* vmin, + const float* delta, + int64_t n_rows, + uint32_t dim, + int64_t batch_offset) +{ + constexpr uint32_t kWarpSize = kIndexGroupSize; + constexpr uint32_t kWarpsPerBlock = BlockSize / kWarpSize; + + const uint32_t lane_id = threadIdx.x % kWarpSize; + const int64_t row_id = + int64_t(threadIdx.x / kWarpSize) + int64_t(blockIdx.x) * int64_t(kWarpsPerBlock); + if (row_id >= n_rows) return; + + uint32_t list_id = 0; + uint32_t inlist_id = 0; + if (lane_id == 0) { + auto source_ix = source_ixs == nullptr ? row_id + batch_offset : source_ixs[row_id]; + list_id = labels[row_id]; + inlist_id = atomicAdd(list_sizes_ptr + list_id, 1); + list_index_ptrs[list_id][inlist_id] = source_ix; + } + list_id = __shfl_sync(0xFFFFFFFF, list_id, 0); + inlist_id = __shfl_sync(0xFFFFFFFF, inlist_id, 0); + + using interleaved_group = raft::Pow2; + auto group_offset = interleaved_group::roundDown(inlist_id); + auto ingroup_id = interleaved_group::mod(inlist_id); + + constexpr uint32_t veclen = list_spec::kVecLen; + uint32_t padded_dim = ((dim + veclen - 1) / veclen) * veclen; + auto* list_dat = list_data_ptrs[list_id] + static_cast(group_offset) * padded_dim; + const T* src = new_vectors + row_id * dim; + const float* ctr = centers + static_cast(list_id) * dim; + + for (uint32_t d = lane_id; d < padded_dim; d += kWarpSize) { + uint8_t out; + if (d < dim) { + float val = utils::mapping{}(src[d]) - ctr[d]; + float dv = delta[d]; + float v = vmin[d]; + float code = (dv > 0.0f) ? 
roundf((val - v) / dv) : 0.0f; + out = static_cast(fminf(fmaxf(code, 0.0f), 255.0f)); + } else { + out = 0; + } + uint32_t l = (d / veclen) * veclen; + uint32_t j = d % veclen; + list_dat[l * kIndexGroupSize + ingroup_id * veclen + j] = out; + } +} + +/** In-place variant: dataset[i] = cast(cast(dataset[i]) - centers[labels[i]]) */ +template +RAFT_KERNEL compute_residuals_inplace_kernel( + T* dataset, const float* centers, const uint32_t* labels, int64_t n_rows, uint32_t dim) +{ + int64_t i = blockIdx.x; + if (i >= n_rows) return; + uint32_t c = labels[i]; + for (uint32_t j = threadIdx.x; j < dim; j += blockDim.x) { + float val = utils::mapping{}(dataset[i * dim + j]); + dataset[i * dim + j] = utils::mapping{}(val - centers[c * dim + j]); + } +} + +template +void extend(raft::resources const& handle, + index* index, + const T* new_vectors, + const int64_t* new_indices, + int64_t n_rows) +{ + using LabelT = uint32_t; + RAFT_EXPECTS(index != nullptr, "index cannot be empty."); + if (n_rows == 0) return; + + auto stream = raft::resource::get_cuda_stream(handle); + auto n_lists = index->n_lists(); + auto dim = index->dim(); + list_spec list_device_spec{index->dim(), + index->conservative_memory_allocation()}; + cuvs::common::nvtx::range fun_scope( + "ivf_sq::extend(%zu, %u)", size_t(n_rows), dim); + + RAFT_EXPECTS(new_indices != nullptr || index->size() == 0, + "You must pass data indices when the index is non-empty."); + + auto new_labels = + raft::make_device_mdarray(handle, + raft::resource::get_large_workspace_resource_ref(handle), + raft::make_extents(n_rows)); + cuvs::cluster::kmeans::balanced_params kmeans_params; + kmeans_params.metric = index->metric(); + auto orig_centroids_view = raft::make_device_matrix_view( + index->centers().data_handle(), n_lists, dim); + + constexpr size_t kReasonableMaxBatchSize = 65536; + size_t max_batch_size = std::min(n_rows, kReasonableMaxBatchSize); + + auto copy_stream = raft::resource::get_cuda_stream(handle); + bool 
enable_prefetch = false; + if (handle.has_resource_factory(raft::resource::resource_type::CUDA_STREAM_POOL)) { + if (raft::resource::get_stream_pool_size(handle) >= 1) { + enable_prefetch = true; + copy_stream = raft::resource::get_stream_from_stream_pool(handle); + } + } + + utils::batch_load_iterator vec_batches(new_vectors, + n_rows, + index->dim(), + max_batch_size, + copy_stream, + raft::resource::get_workspace_resource_ref(handle), + enable_prefetch); + vec_batches.prefetch_next_batch(); + + const bool needs_prefetch_sync = enable_prefetch && vec_batches.does_copy(); + + for (const auto& batch : vec_batches) { + auto batch_data_view = + raft::make_device_matrix_view(batch.data(), batch.size(), index->dim()); + auto batch_labels_view = raft::make_device_vector_view( + new_labels.data_handle() + batch.offset(), batch.size()); + cuvs::cluster::kmeans::predict( + handle, kmeans_params, batch_data_view, orig_centroids_view, batch_labels_view); + vec_batches.prefetch_next_batch(); + if (needs_prefetch_sync) { raft::resource::sync_stream(handle); } + } + + auto* list_sizes_ptr = index->list_sizes().data_handle(); + auto old_list_sizes_dev = raft::make_device_vector(handle, n_lists); + raft::copy(old_list_sizes_dev.data_handle(), list_sizes_ptr, n_lists, stream); + + raft::stats::histogram(raft::stats::HistTypeAuto, + reinterpret_cast(list_sizes_ptr), + int64_t(n_lists), + new_labels.data_handle(), + n_rows, + 1, + stream); + raft::linalg::add( + list_sizes_ptr, list_sizes_ptr, old_list_sizes_dev.data_handle(), n_lists, stream); + + std::vector new_list_sizes(n_lists); + std::vector old_list_sizes(n_lists); + { + raft::copy(old_list_sizes.data(), old_list_sizes_dev.data_handle(), n_lists, stream); + raft::copy(new_list_sizes.data(), list_sizes_ptr, n_lists, stream); + raft::resource::sync_stream(handle); + auto& lists = index->lists(); + for (uint32_t label = 0; label < n_lists; label++) { + ivf::resize_list(handle, + lists[label], + list_device_spec, + 
new_list_sizes[label], + raft::Pow2::roundUp(old_list_sizes[label])); + } + } + ivf::detail::recompute_internal_state(handle, *index); + raft::copy(list_sizes_ptr, old_list_sizes_dev.data_handle(), n_lists, stream); + + utils::batch_load_iterator vec_indices( + new_indices, + n_rows, + 1, + max_batch_size, + stream, + raft::resource::get_workspace_resource_ref(handle)); + vec_batches.reset(); + vec_batches.prefetch_next_batch(); + utils::batch_load_iterator idx_batch = vec_indices.begin(); + + size_t next_report_offset = 0; + size_t d_report_offset = n_rows * 5 / 100; + + for (const auto& batch : vec_batches) { + int64_t bs = batch.size(); + + { + constexpr int kEncodeBlockSize = 256; + constexpr int kEncodeWarpsPerBlk = kEncodeBlockSize / kIndexGroupSize; + const dim3 block_dim(kEncodeBlockSize); + const dim3 grid_dim(raft::ceildiv(bs, int64_t(kEncodeWarpsPerBlk))); + encode_and_fill_kernel + <<>>(new_labels.data_handle() + batch.offset(), + batch.data(), + index->centers().data_handle(), + idx_batch->data(), + index->data_ptrs().data_handle(), + index->inds_ptrs().data_handle(), + list_sizes_ptr, + index->sq_vmin().data_handle(), + index->sq_delta().data_handle(), + bs, + dim, + batch.offset()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } + + vec_batches.prefetch_next_batch(); + if (needs_prefetch_sync) { raft::resource::sync_stream(handle); } + + if (batch.offset() > next_report_offset) { + float progress = batch.offset() * 100.0f / n_rows; + RAFT_LOG_DEBUG("ivf_sq::extend added vectors %zu, %6.1f%% complete", + static_cast(batch.offset()), + progress); + next_report_offset += d_report_offset; + } + ++idx_batch; + } + + auto compute_center_norms = [&]() { + if (index->metric() == cuvs::distance::DistanceType::CosineExpanded) { + raft::linalg::rowNorm(index->center_norms()->data_handle(), + index->centers().data_handle(), + dim, + n_lists, + stream, + raft::sqrt_op{}); + } else { + raft::linalg::rowNorm( + index->center_norms()->data_handle(), 
index->centers().data_handle(), dim, n_lists, stream); + } + }; + + if (!index->center_norms().has_value()) { + index->allocate_center_norms(handle); + if (index->center_norms().has_value()) { compute_center_norms(); } + } +} + +template +auto extend(raft::resources const& handle, + const index& orig_index, + const T* new_vectors, + const int64_t* new_indices, + int64_t n_rows) -> index +{ + auto ext_index = clone(handle, orig_index); + detail::extend(handle, &ext_index, new_vectors, new_indices, n_rows); + return ext_index; +} + +template +inline auto build( + raft::resources const& handle, + const index_params& params, + raft::mdspan, raft::row_major, accessor> dataset) + -> index +{ + int64_t n_rows = dataset.extent(0); + uint32_t dim = dataset.extent(1); + auto stream = raft::resource::get_cuda_stream(handle); + cuvs::common::nvtx::range fun_scope( + "ivf_sq::build(%zu, %u)", size_t(n_rows), dim); + static_assert(std::is_same_v || std::is_same_v, "unsupported data type"); + RAFT_EXPECTS(n_rows > 0 && dim > 0, "empty dataset"); + RAFT_EXPECTS(n_rows >= params.n_lists, "number of rows can't be less than n_lists"); + RAFT_EXPECTS(params.kmeans_trainset_fraction > 0.0 && params.kmeans_trainset_fraction <= 1.0, + "kmeans_trainset_fraction must be in (0, 1]"); + RAFT_EXPECTS(params.metric != cuvs::distance::DistanceType::CosineExpanded || dim > 1, + "Cosine metric requires more than one dim"); + + index idx(handle, params, dim); + + // Train k-means centroids and SQ parameters on the same training subset. + // This mirrors IVF-PQ, which also trains its codebook on a subset of the data. 
+ { + raft::random::RngState random_state{137}; + auto trainset_ratio = std::max( + 1, n_rows / std::max(params.kmeans_trainset_fraction * n_rows, idx.n_lists())); + auto n_rows_train = n_rows / trainset_ratio; + auto trainset = + raft::make_device_mdarray(handle, + raft::resource::get_large_workspace_resource_ref(handle), + raft::make_extents(n_rows_train, idx.dim())); + raft::matrix::sample_rows(handle, random_state, dataset, trainset.view()); + auto trainset_const_view = raft::make_const_mdspan(trainset.view()); + auto centers_view = raft::make_device_matrix_view( + idx.centers().data_handle(), idx.n_lists(), idx.dim()); + cuvs::cluster::kmeans::balanced_params kmeans_params; + kmeans_params.n_iters = params.kmeans_n_iters; + kmeans_params.metric = idx.metric(); + cuvs::cluster::kmeans::fit(handle, kmeans_params, trainset_const_view, centers_view); + + // Train SQ: predict labels for the training subset, compute residuals in-place, + // and derive per-dimension vmin/delta from them. + { + auto train_labels = raft::make_device_vector(handle, n_rows_train); + auto centers_const_view = raft::make_device_matrix_view( + idx.centers().data_handle(), idx.n_lists(), dim); + cuvs::cluster::kmeans::predict( + handle, kmeans_params, trainset_const_view, centers_const_view, train_labels.view()); + + constexpr int kResidualBlockSize = 256; + compute_residuals_inplace_kernel + <<>>(trainset.data_handle(), + idx.centers().data_handle(), + train_labels.data_handle(), + n_rows_train, + dim); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } + + // After the in-place kernel, trainset now contains residuals. 
+ auto& residuals = trainset; + + { + auto vmax_buf = raft::make_device_vector(handle, dim); + auto* vmin_ptr = idx.sq_vmin().data_handle(); + auto* vmax_ptr = vmax_buf.data_handle(); + + constexpr int kMinMaxBlockSize = 256; + launch_fused_column_minmax( + residuals.data_handle(), vmin_ptr, vmax_ptr, n_rows_train, dim, stream); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + // Expand the observed range by a small margin to reduce clipping on unseen data, + // since the SQ parameters are trained on a subset rather than the full dataset. + constexpr float kMargin = 0.05f; + auto* delta_ptr = idx.sq_delta().data_handle(); + raft::linalg::map_offset( + handle, idx.sq_vmin(), [vmin_ptr, vmax_ptr, delta_ptr, kMargin] __device__(uint32_t j) { + float range = vmax_ptr[j] - vmin_ptr[j]; + float margin = range * kMargin; + delta_ptr[j] = (range > 0.0f) ? (range + 2.0f * margin) / 255.0f : 1.0f; + return vmin_ptr[j] - margin; + }); + } + } + + if (params.add_data_on_build) { + detail::extend(handle, &idx, dataset.data_handle(), nullptr, n_rows); + } + + return idx; +} + +template +void build(raft::resources const& handle, + const index_params& params, + raft::device_matrix_view dataset, + index& idx) +{ + idx = build(handle, params, dataset); +} + +template +void build(raft::resources const& handle, + const index_params& params, + raft::host_matrix_view dataset, + index& idx) +{ + idx = build(handle, params, dataset); +} + +template +auto extend(raft::resources const& handle, + raft::device_matrix_view new_vectors, + std::optional> new_indices, + const index& orig_index) -> index +{ + RAFT_EXPECTS(new_vectors.extent(1) == orig_index.dim(), + "new_vectors should have the same dimension as the index"); + if (new_indices.has_value()) { + RAFT_EXPECTS(new_indices.value().extent(0) == new_vectors.extent(0), + "new_vectors and new_indices have different number of rows"); + } + int64_t n_rows = new_vectors.extent(0); + return extend(handle, + orig_index, + new_vectors.data_handle(), + 
new_indices.has_value() ? new_indices.value().data_handle() : nullptr, + n_rows); +} + +template +auto extend(raft::resources const& handle, + raft::host_matrix_view new_vectors, + std::optional> new_indices, + const index& orig_index) -> index +{ + RAFT_EXPECTS(new_vectors.extent(1) == orig_index.dim(), + "new_vectors should have the same dimension as the index"); + if (new_indices.has_value()) { + RAFT_EXPECTS(new_indices.value().extent(0) == new_vectors.extent(0), + "new_vectors and new_indices have different number of rows"); + } + int64_t n_rows = new_vectors.extent(0); + return extend(handle, + orig_index, + new_vectors.data_handle(), + new_indices.has_value() ? new_indices.value().data_handle() : nullptr, + n_rows); +} + +template +void extend(raft::resources const& handle, + raft::device_matrix_view new_vectors, + std::optional> new_indices, + index* idx) +{ + RAFT_EXPECTS(new_vectors.extent(1) == idx->dim(), + "new_vectors should have the same dimension as the index"); + if (new_indices.has_value()) { + RAFT_EXPECTS(new_indices.value().extent(0) == new_vectors.extent(0), + "new_vectors and new_indices have different number of rows"); + } + detail::extend(handle, + idx, + new_vectors.data_handle(), + new_indices.has_value() ? new_indices.value().data_handle() : nullptr, + new_vectors.extent(0)); +} + +template +void extend(raft::resources const& handle, + raft::host_matrix_view new_vectors, + std::optional> new_indices, + index* idx) +{ + RAFT_EXPECTS(new_vectors.extent(1) == idx->dim(), + "new_vectors should have the same dimension as the index"); + if (new_indices.has_value()) { + RAFT_EXPECTS(new_indices.value().extent(0) == new_vectors.extent(0), + "new_vectors and new_indices have different number of rows"); + } + detail::extend(handle, + idx, + new_vectors.data_handle(), + new_indices.has_value() ? 
new_indices.value().data_handle() : nullptr, + new_vectors.extent(0)); +} + +} // namespace detail +} // namespace cuvs::neighbors::ivf_sq diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_float_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_float_uint8_t_int64_t.cu new file mode 100644 index 0000000000..f22154e50f --- /dev/null +++ b/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_float_uint8_t_int64_t.cu @@ -0,0 +1,89 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include "ivf_sq_build.cuh" + +namespace cuvs::neighbors::ivf_sq { + +#define CUVS_INST_IVF_SQ_BUILD_EXTEND(T, CodeT) \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_sq::index_params& params, \ + raft::device_matrix_view dataset) \ + -> cuvs::neighbors::ivf_sq::index \ + { \ + return cuvs::neighbors::ivf_sq::index( \ + std::move(cuvs::neighbors::ivf_sq::detail::build(handle, params, dataset))); \ + } \ + \ + void build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_sq::index_params& params, \ + raft::device_matrix_view dataset, \ + cuvs::neighbors::ivf_sq::index& idx) \ + { \ + cuvs::neighbors::ivf_sq::detail::build(handle, params, dataset, idx); \ + } \ + \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_sq::index_params& params, \ + raft::host_matrix_view dataset) \ + -> cuvs::neighbors::ivf_sq::index \ + { \ + return cuvs::neighbors::ivf_sq::index( \ + std::move(cuvs::neighbors::ivf_sq::detail::build(handle, params, dataset))); \ + } \ + \ + void build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_sq::index_params& params, \ + raft::host_matrix_view dataset, \ + cuvs::neighbors::ivf_sq::index& idx) \ + { \ + cuvs::neighbors::ivf_sq::detail::build(handle, params, dataset, idx); \ + } \ + \ + auto extend(raft::resources const& handle, \ + raft::device_matrix_view new_vectors, \ + std::optional> new_indices, 
\ + const cuvs::neighbors::ivf_sq::index& orig_index) \ + -> cuvs::neighbors::ivf_sq::index \ + { \ + return cuvs::neighbors::ivf_sq::index( \ + std::move(cuvs::neighbors::ivf_sq::detail::extend( \ + handle, new_vectors, new_indices, orig_index))); \ + } \ + \ + void extend(raft::resources const& handle, \ + raft::device_matrix_view new_vectors, \ + std::optional> new_indices, \ + cuvs::neighbors::ivf_sq::index* idx) \ + { \ + cuvs::neighbors::ivf_sq::detail::extend(handle, new_vectors, new_indices, idx); \ + } \ + \ + auto extend(raft::resources const& handle, \ + raft::host_matrix_view new_vectors, \ + std::optional> new_indices, \ + const cuvs::neighbors::ivf_sq::index& orig_index) \ + -> cuvs::neighbors::ivf_sq::index \ + { \ + return cuvs::neighbors::ivf_sq::index( \ + std::move(cuvs::neighbors::ivf_sq::detail::extend( \ + handle, new_vectors, new_indices, orig_index))); \ + } \ + \ + void extend(raft::resources const& handle, \ + raft::host_matrix_view new_vectors, \ + std::optional> new_indices, \ + cuvs::neighbors::ivf_sq::index* idx) \ + { \ + cuvs::neighbors::ivf_sq::detail::extend(handle, new_vectors, new_indices, idx); \ + } + +CUVS_INST_IVF_SQ_BUILD_EXTEND(float, uint8_t); + +#undef CUVS_INST_IVF_SQ_BUILD_EXTEND + +} // namespace cuvs::neighbors::ivf_sq diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_half_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_half_uint8_t_int64_t.cu new file mode 100644 index 0000000000..e6900af4a0 --- /dev/null +++ b/cpp/src/neighbors/ivf_sq/ivf_sq_build_extend_half_uint8_t_int64_t.cu @@ -0,0 +1,89 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include "ivf_sq_build.cuh" + +namespace cuvs::neighbors::ivf_sq { + +#define CUVS_INST_IVF_SQ_BUILD_EXTEND(T, CodeT) \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_sq::index_params& params, \ + raft::device_matrix_view dataset) \ + -> cuvs::neighbors::ivf_sq::index \ + { \ + return cuvs::neighbors::ivf_sq::index( \ + std::move(cuvs::neighbors::ivf_sq::detail::build(handle, params, dataset))); \ + } \ + \ + void build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_sq::index_params& params, \ + raft::device_matrix_view dataset, \ + cuvs::neighbors::ivf_sq::index& idx) \ + { \ + cuvs::neighbors::ivf_sq::detail::build(handle, params, dataset, idx); \ + } \ + \ + auto build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_sq::index_params& params, \ + raft::host_matrix_view dataset) \ + -> cuvs::neighbors::ivf_sq::index \ + { \ + return cuvs::neighbors::ivf_sq::index( \ + std::move(cuvs::neighbors::ivf_sq::detail::build(handle, params, dataset))); \ + } \ + \ + void build(raft::resources const& handle, \ + const cuvs::neighbors::ivf_sq::index_params& params, \ + raft::host_matrix_view dataset, \ + cuvs::neighbors::ivf_sq::index& idx) \ + { \ + cuvs::neighbors::ivf_sq::detail::build(handle, params, dataset, idx); \ + } \ + \ + auto extend(raft::resources const& handle, \ + raft::device_matrix_view new_vectors, \ + std::optional> new_indices, \ + const cuvs::neighbors::ivf_sq::index& orig_index) \ + -> cuvs::neighbors::ivf_sq::index \ + { \ + return cuvs::neighbors::ivf_sq::index( \ + std::move(cuvs::neighbors::ivf_sq::detail::extend( \ + handle, new_vectors, new_indices, orig_index))); \ + } \ + \ + void extend(raft::resources const& handle, \ + raft::device_matrix_view new_vectors, \ + std::optional> new_indices, \ + cuvs::neighbors::ivf_sq::index* idx) \ + { \ + cuvs::neighbors::ivf_sq::detail::extend(handle, new_vectors, new_indices, idx); \ + } \ + \ 
+ auto extend(raft::resources const& handle, \ + raft::host_matrix_view new_vectors, \ + std::optional> new_indices, \ + const cuvs::neighbors::ivf_sq::index& orig_index) \ + -> cuvs::neighbors::ivf_sq::index \ + { \ + return cuvs::neighbors::ivf_sq::index( \ + std::move(cuvs::neighbors::ivf_sq::detail::extend( \ + handle, new_vectors, new_indices, orig_index))); \ + } \ + \ + void extend(raft::resources const& handle, \ + raft::host_matrix_view new_vectors, \ + std::optional> new_indices, \ + cuvs::neighbors::ivf_sq::index* idx) \ + { \ + cuvs::neighbors::ivf_sq::detail::extend(handle, new_vectors, new_indices, idx); \ + } + +CUVS_INST_IVF_SQ_BUILD_EXTEND(half, uint8_t); + +#undef CUVS_INST_IVF_SQ_BUILD_EXTEND + +} // namespace cuvs::neighbors::ivf_sq diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh new file mode 100644 index 0000000000..6bc64c1569 --- /dev/null +++ b/cpp/src/neighbors/ivf_sq/ivf_sq_search.cuh @@ -0,0 +1,786 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include "../../core/nvtx.hpp" +#include "../detail/ann_utils.cuh" +#include "../ivf_common.cuh" +#include "../sample_filter.cuh" +#include "detail/jit_lto_kernels/kernel_def.hpp" +#include "detail/jit_lto_kernels/scan_planner.hpp" +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace cuvs::neighbors::ivf_sq::detail { + +using namespace cuvs::spatial::knn::detail; // NOLINT + +// Maximum fused top-k capacity we instantiate for the scan kernel. +// Must match the highest Capacity case in ivf_sq_scan's switch. 
+static constexpr int kMaxSqScanCapacity = 256; +static_assert(kMaxSqScanCapacity <= raft::matrix::detail::select::warpsort::kMaxCapacity, + "kMaxSqScanCapacity must not exceed the warpsort library's maximum supported " + "capacity; reduce kMaxSqScanCapacity or update the warpsort dependency."); + +auto RAFT_WEAK_FUNCTION is_local_topk_feasible(uint32_t k) -> bool +{ + return k <= kMaxSqScanCapacity; +} + +// Compute shared-memory size for the scan kernel (3 x dim floats), plus the +// block_sort merge buffer when fused top-k is enabled. Must match the smem +// layout used by ivf_sq_scan_impl. +inline size_t sq_scan_smem_size(uint32_t dim) { return 3 * dim * sizeof(float); } + +template +size_t sq_scan_total_smem(uint32_t dim, uint32_t k) +{ + size_t scan_smem = sq_scan_smem_size(dim); + if constexpr (Capacity > 0) { + constexpr int kSubwarpSize = std::min(Capacity, raft::WarpSize); + int num_subwarps = kSqScanThreads / kSubwarpSize; + size_t merge_smem = + raft::matrix::detail::select::warpsort::calc_smem_size_for_block_wide( + num_subwarps, k); + return std::max(scan_smem, merge_smem); + } + return scan_smem; +} + +// Map the runtime distance metric onto the corresponding compile-time tag and +// invoke `f` (a generic lambda) with the tag type. Used by the JIT-LTO launch +// dispatch to forward the metric to the planner without bloating the static +// library with the kernel body. 
+template +void dispatch_metric_tag(cuvs::distance::DistanceType metric, F&& f) +{ + switch (metric) { + case cuvs::distance::DistanceType::L2Expanded: + case cuvs::distance::DistanceType::L2SqrtExpanded: f(tag_metric_l2{}); return; + case cuvs::distance::DistanceType::InnerProduct: f(tag_metric_ip{}); return; + case cuvs::distance::DistanceType::CosineExpanded: f(tag_metric_cosine{}); return; + default: RAFT_FAIL("Unsupported metric type for IVF-SQ scan."); + } +} + +template +constexpr auto get_filter_type_tag() +{ + using namespace cuvs::neighbors::filtering; + if constexpr (std::is_same_v) { + return cuvs::neighbors::detail::tag_filter_none{}; + } else if constexpr (std::is_same_v>) { + return cuvs::neighbors::detail::tag_filter_bitset{}; + } +} + +// Choose grid.x to saturate the GPU. Mirrors the previous static +// configure_grid_dim_x but takes the JIT-linked cudaKernel_t. +inline uint32_t configure_grid_dim_x( + uint32_t n_queries, uint32_t n_probes, int smem_size, int block_size, cudaKernel_t kernel) +{ + int dev_id; + RAFT_CUDA_TRY(cudaGetDevice(&dev_id)); + int num_sms; + RAFT_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id)); + int num_blocks_per_sm = 0; + RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &num_blocks_per_sm, kernel, block_size, smem_size)); + + size_t min_grid_size = size_t(num_sms) * num_blocks_per_sm; + size_t min_grid_x = raft::ceildiv(min_grid_size, n_queries); + return std::min(n_probes, static_cast(min_grid_x)); +} + +// --------------------------------------------------------------------------- +// JIT-LTO scan launcher +// +// The scan kernel `ivf_sq_scan_impl` lives as a fatbin fragment under +// `detail/jit_lto_kernels/`. At runtime we ask the planner for the linked +// kernel matching (Capacity, MetricTag, FilterTag), set the dynamic shared +// memory attribute, compute grid_dim_x via cudaOccupancy*, and dispatch. 
+// +// Grid layout matches the original static kernel: +// kManageLocalTopK (Capacity > 0): +// grid (grid_dim_x, n_queries) - each block loops over probes +// otherwise (Capacity == 0): +// grid (n_probes, n_queries) - one block per (query, probe) +// --------------------------------------------------------------------------- +template +void launch_kernel(const index& idx, + const float* queries_float, + const float* query_norms, + uint32_t n_queries, + uint32_t n_probes, + uint32_t k, + uint32_t max_samples, + const uint32_t* coarse_indices, + const uint32_t* chunk_indices, + float* out_distances, + uint32_t* out_indices, + uint32_t& grid_dim_x, + rmm::cuda_stream_view stream, + IvfSampleFilterT sample_filter) +{ + static_assert(std::is_same_v, "IVF-SQ JIT-LTO scan only supports CodeT=uint8_t"); + + constexpr bool kManageLocalTopK = (Capacity > 0); + constexpr int kThreads = kSqScanThreads; + uint32_t dim = idx.dim(); + + // The scan entrypoint is templated only on Capacity. The warpsort runs + // ascending for all metrics; metric specialization is composed in at link + // time via four device-function fragments (setup_invariant_smem, + // setup_per_probe_smem, accumulate_distance, finalize_distance). + IvfSqScanPlanner kernel_planner; + kernel_planner.add_entrypoint(); + kernel_planner.add_setup_invariant_smem_function(); + kernel_planner.add_setup_per_probe_smem_function(); + kernel_planner.add_accumulate_distance_function(); + kernel_planner.add_finalize_distance_function(); + kernel_planner.add_filter_device_function(); + auto kernel_launcher = kernel_planner.get_launcher(); + + // Extract bitset arguments from the user's filter object so they can be + // passed to the JIT-linked kernel as plain pointers/scalars; the JIT'd + // sample_filter fragment reconstructs the bitset_view inside the kernel. 
+ int64_t* const* inds_ptrs_dev = idx.inds_ptrs().data_handle(); + uint32_t* bitset_ptr = nullptr; + int64_t bitset_len = 0; + int64_t original_nbits = 0; + if constexpr (std::is_same_v>) { + bitset_ptr = sample_filter.view().data(); + bitset_len = sample_filter.view().size(); + original_nbits = sample_filter.view().get_original_nbits(); + } + + size_t smem = sq_scan_total_smem(dim, k); + + { + int dev_id; + RAFT_CUDA_TRY(cudaGetDevice(&dev_id)); + int max_smem; + RAFT_CUDA_TRY( + cudaDeviceGetAttribute(&max_smem, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev_id)); + RAFT_EXPECTS(smem <= size_t(max_smem), + "IVF-SQ scan kernel requires %zu bytes of shared memory (dim=%u, k=%u), " + "but the device supports at most %d bytes per block.", + smem, + dim, + k, + max_smem); + } + + { + int dev_id; + RAFT_CUDA_TRY(cudaGetDevice(&dev_id)); + RAFT_CUDA_TRY(cudaKernelSetAttributeForDevice(kernel_launcher->get_kernel(), + cudaFuncAttributeMaxDynamicSharedMemorySize, + static_cast(smem), + dev_id)); + } + + constexpr uint32_t kMaxGridY = 65535; + + if constexpr (kManageLocalTopK) { + if (grid_dim_x == 0) { + grid_dim_x = configure_grid_dim_x( + std::min(kMaxGridY, n_queries), n_probes, smem, kThreads, kernel_launcher->get_kernel()); + return; + } + } + + dim3 block(kThreads); + + for (uint32_t query_offset = 0; query_offset < n_queries; query_offset += kMaxGridY) { + uint32_t batch = std::min(kMaxGridY, n_queries - query_offset); + dim3 grid = kManageLocalTopK ? dim3(grid_dim_x, batch) : dim3(n_probes, batch); + + const float* q_ptr = queries_float + uint64_t(query_offset) * dim; + const float* qn_ptr = query_norms ? 
query_norms + query_offset : query_norms; + const uint32_t* ci = coarse_indices + uint64_t(query_offset) * n_probes; + const uint32_t* ch = chunk_indices + uint64_t(query_offset) * n_probes; + float* od = out_distances; + uint32_t* oi = out_indices; + if constexpr (kManageLocalTopK) { + od += uint64_t(query_offset) * grid_dim_x * k; + oi += uint64_t(query_offset) * grid_dim_x * k; + } else { + od += uint64_t(query_offset) * max_samples; + oi += uint64_t(query_offset) * max_samples; + } + + const uint8_t* const* data_ptrs = idx.data_ptrs().data_handle(); + const uint32_t* list_sizes = idx.list_sizes().data_handle(); + const float* centers = idx.centers().data_handle(); + const float* sq_vmin = idx.sq_vmin().data_handle(); + const float* sq_delta = idx.sq_delta().data_handle(); + + kernel_launcher->dispatch>(stream, + grid, + block, + smem, + data_ptrs, + list_sizes, + ci, + q_ptr, + centers, + sq_vmin, + sq_delta, + qn_ptr, + n_probes, + dim, + k, + max_samples, + ch, + od, + oi, + inds_ptrs_dev, + bitset_ptr, + bitset_len, + original_nbits); + } +} + +// --------------------------------------------------------------------------- +// ivf_sq_scan: top-level scan dispatch with Capacity selection +// --------------------------------------------------------------------------- +template +void ivf_sq_scan(raft::resources const& handle, + const index& idx, + const float* queries_float, + const float* query_norms, + uint32_t n_queries, + uint32_t n_probes, + uint32_t k, + uint32_t max_samples, + const uint32_t* coarse_indices, + const uint32_t* chunk_indices, + float* out_distances, + uint32_t* out_indices, + IvfSampleFilterT sample_filter, + uint32_t& grid_dim_x, + rmm::cuda_stream_view stream) +{ + // Determine the fused top-k capacity (0 = disabled / fallback to materialization) + int capacity = is_local_topk_feasible(k) ? raft::bound_by_power_of_two(int(k)) : 0; + + // Snap to the nearest supported compile-time Capacity value (must be a + // power of two). 
Values up to 32 share one instantiation; 64, 128 and 256 + // each get their own. Beyond kMaxSqScanCapacity we fall back to the + // non-fused path (Capacity == 0). + if (capacity > 0 && capacity < 32) { + capacity = 32; + } else if (capacity > kMaxSqScanCapacity) { + capacity = 0; + } + + using IvfSampleFilterTag = decltype(get_filter_type_tag()); + + auto fwd = [&](auto cap_tag) { + constexpr int kCap = decltype(cap_tag)::value; + dispatch_metric_tag(idx.metric(), [&](auto metric_tag) { + using MetricTag = decltype(metric_tag); + launch_kernel(idx, + queries_float, + query_norms, + n_queries, + n_probes, + k, + max_samples, + coarse_indices, + chunk_indices, + out_distances, + out_indices, + grid_dim_x, + stream, + sample_filter); + }); + }; + + switch (capacity) { + case 0: fwd(std::integral_constant{}); break; + case 32: fwd(std::integral_constant{}); break; + case 64: fwd(std::integral_constant{}); break; + case 128: fwd(std::integral_constant{}); break; + case 256: fwd(std::integral_constant{}); break; + default: RAFT_FAIL("Unexpected capacity value %d", capacity); + } +} + +// --------------------------------------------------------------------------- +// search_impl — host-side search logic +// --------------------------------------------------------------------------- +template +void search_impl(raft::resources const& handle, + const index& index, + const T* queries, + uint32_t n_queries, + uint32_t k, + uint32_t n_probes, + int64_t* neighbors, + float* distances, + rmm::device_async_resource_ref search_mr, + IvfSampleFilterT sample_filter) +{ + auto stream = raft::resource::get_cuda_stream(handle); + auto dim = index.dim(); + + // The scan kernel emits min-close distances for every metric (IP scores are + // negated by finalize_distance and undone by postprocess_distances below), + // so all scan-related top-k selections use select_min = true. Only the + // coarse search, which runs on the raw GEMM output, follows the metric's + // native direction. 
+ const bool coarse_select_min = cuvs::distance::is_min_close(index.metric()); + + std::size_t n_queries_probes = std::size_t(n_queries) * std::size_t(n_probes); + + bool needs_query_norms = index.metric() != cuvs::distance::DistanceType::InnerProduct; + rmm::device_uvector query_norm_dev(needs_query_norms ? n_queries : 0, stream, search_mr); + // Pass nullptr to the kernel for IP, where the empty uvector's data() pointer + // is implementation-defined and the kernel's `query_norms` argument is unused. + const float* qn_ptr = needs_query_norms ? query_norm_dev.data() : nullptr; + rmm::device_uvector distance_buffer_dev(n_queries * index.n_lists(), stream, search_mr); + rmm::device_uvector coarse_distances_dev(n_queries_probes, stream, search_mr); + rmm::device_uvector coarse_indices_dev(n_queries_probes, stream, search_mr); + + size_t float_query_size; + if constexpr (std::is_same_v) { + float_query_size = 0; + } else { + float_query_size = n_queries * dim; + } + rmm::device_uvector converted_queries_dev(float_query_size, stream, search_mr); + float* converted_queries_ptr = converted_queries_dev.data(); + + if constexpr (std::is_same_v) { + converted_queries_ptr = const_cast(queries); + } else { + raft::linalg::unaryOp( + converted_queries_ptr, queries, n_queries * dim, utils::mapping{}, stream); + } + + auto distance_buffer_dev_view = raft::make_device_matrix_view( + distance_buffer_dev.data(), n_queries, index.n_lists()); + + RAFT_EXPECTS(index.metric() == cuvs::distance::DistanceType::InnerProduct || + index.center_norms().has_value(), + "Center norms are required for search with L2 or Cosine metric. 
" + "Rebuild the index with add_data_on_build=true or call extend() first."); + + float alpha = 1.0f; + float beta = 0.0f; + switch (index.metric()) { + case cuvs::distance::DistanceType::L2Expanded: + case cuvs::distance::DistanceType::L2SqrtExpanded: { + alpha = -2.0f; + beta = 1.0f; + raft::linalg::rowNorm(query_norm_dev.data(), + converted_queries_ptr, + static_cast(dim), + static_cast(n_queries), + stream); + utils::outer_add(query_norm_dev.data(), + (int64_t)n_queries, + index.center_norms()->data_handle(), + (int64_t)index.n_lists(), + distance_buffer_dev.data(), + stream); + break; + } + case cuvs::distance::DistanceType::CosineExpanded: { + raft::linalg::rowNorm(query_norm_dev.data(), + converted_queries_ptr, + static_cast(dim), + static_cast(n_queries), + stream, + raft::sqrt_op{}); + alpha = -1.0f; + beta = 0.0f; + break; + } + case cuvs::distance::DistanceType::InnerProduct: { + alpha = 1.0f; + beta = 0.0f; + break; + } + default: RAFT_FAIL("Unsupported metric type for IVF-SQ search."); + } + + raft::linalg::gemm(handle, + true, + false, + index.n_lists(), + n_queries, + dim, + &alpha, + index.centers().data_handle(), + dim, + converted_queries_ptr, + dim, + &beta, + distance_buffer_dev.data(), + index.n_lists(), + stream); + + if (index.metric() == cuvs::distance::DistanceType::CosineExpanded) { + auto n_lists_local = index.n_lists(); + const auto* q_norm_ptr = query_norm_dev.data(); + const auto* center_norm_ptr = index.center_norms()->data_handle(); + raft::linalg::map_offset( + handle, + distance_buffer_dev_view, + [=] __device__(const int64_t idx, const float dist) { + const auto query = idx / n_lists_local; + const auto cluster = idx % n_lists_local; + float denom = q_norm_ptr[query] * center_norm_ptr[cluster]; + return (denom > 0.0f) ? 
dist / denom : 0.0f; + }, + raft::make_const_mdspan(distance_buffer_dev_view)); + } + + cuvs::selection::select_k( + handle, + raft::make_const_mdspan(distance_buffer_dev_view), + std::nullopt, + raft::make_device_matrix_view(coarse_distances_dev.data(), n_queries, n_probes), + raft::make_device_matrix_view( + coarse_indices_dev.data(), n_queries, n_probes), + coarse_select_min); + + rmm::device_uvector num_samples(n_queries, stream, search_mr); + rmm::device_uvector chunk_index(n_queries_probes, stream, search_mr); + + ivf::detail::calc_chunk_indices::configure(n_probes, n_queries)(index.list_sizes().data_handle(), + coarse_indices_dev.data(), + chunk_index.data(), + num_samples.data(), + stream); + + bool manage_local_topk = is_local_topk_feasible(k); + + // Determine grid_dim_x for the fused path + uint32_t grid_dim_x = 0; + if (manage_local_topk) { + // Query the occupancy to compute optimal grid_dim_x (does not launch) + ivf_sq_scan(handle, + index, + converted_queries_ptr, + qn_ptr, + n_queries, + n_probes, + k, + 0, + coarse_indices_dev.data(), + chunk_index.data(), + nullptr, + nullptr, + sample_filter, + grid_dim_x, + stream); + if (grid_dim_x == 0) { + manage_local_topk = false; + RAFT_LOG_WARN( + "IVF-SQ fused top-k kernel has zero occupancy (dim=%u, k=%u); " + "falling back to the non-fused scan path.", + index.dim(), + k); + } + } + + // The scan kernel writes sample-local uint32 indices; postprocess_neighbors + // lifts them to int64_t database indices via inds_ptrs. 
+ rmm::device_uvector neighbors_uint32(std::size_t(n_queries) * k, stream, search_mr); + uint32_t* neighbors_uint32_ptr = neighbors_uint32.data(); + + if (manage_local_topk) { + // --- Fused top-k path --- + auto target_size = std::size_t(n_queries) * grid_dim_x * k; + rmm::device_uvector distances_tmp(0, stream, search_mr); + rmm::device_uvector indices_tmp(0, stream, search_mr); + + float* dist_out_ptr = nullptr; + uint32_t* idx_out_ptr = nullptr; + + if (grid_dim_x > 1) { + distances_tmp.resize(target_size, stream); + indices_tmp.resize(target_size, stream); + dist_out_ptr = distances_tmp.data(); + idx_out_ptr = indices_tmp.data(); + } else { + dist_out_ptr = distances; + idx_out_ptr = neighbors_uint32_ptr; + } + + ivf_sq_scan(handle, + index, + converted_queries_ptr, + qn_ptr, + n_queries, + n_probes, + k, + 0, + coarse_indices_dev.data(), + chunk_index.data(), + dist_out_ptr, + idx_out_ptr, + sample_filter, + grid_dim_x, + stream); + + // Merge across blocks if needed + if (grid_dim_x > 1) { + auto cols = uint32_t(grid_dim_x) * k; + cuvs::selection::select_k( + handle, + raft::make_device_matrix_view(distances_tmp.data(), n_queries, cols), + raft::make_device_matrix_view(indices_tmp.data(), n_queries, cols), + raft::make_device_matrix_view(distances, n_queries, k), + raft::make_device_matrix_view(neighbors_uint32_ptr, n_queries, k), + /*select_min=*/true); + } + } else { + // --- Fallback: materialize all distances --- + int64_t ms = std::max(index.accum_sorted_sizes()(n_probes), k); + RAFT_EXPECTS(ms <= int64_t(std::numeric_limits::max()), + "The maximum sample size is too big."); + uint32_t max_samples = static_cast(ms); + + rmm::device_uvector all_distances( + std::size_t(n_queries) * max_samples, stream, search_mr); + rmm::device_uvector all_indices( + std::size_t(n_queries) * max_samples, stream, search_mr); + + thrust::fill_n(raft::resource::get_thrust_policy(handle), + all_distances.data(), + std::size_t(n_queries) * max_samples, + 
std::numeric_limits::max()); + thrust::fill_n(raft::resource::get_thrust_policy(handle), + all_indices.data(), + std::size_t(n_queries) * max_samples, + uint32_t(0xFFFFFFFF)); + + // grid_dim_x is unused for the non-fused path; set to n_probes so each + // block in the (n_probes, n_queries) grid processes exactly one probe + uint32_t gdx = n_probes; + ivf_sq_scan(handle, + index, + converted_queries_ptr, + qn_ptr, + n_queries, + n_probes, + k, + max_samples, + coarse_indices_dev.data(), + chunk_index.data(), + all_distances.data(), + all_indices.data(), + sample_filter, + gdx, + stream); + + auto num_samples_view = + raft::make_device_vector_view(num_samples.data(), n_queries); + + cuvs::selection::select_k( + handle, + raft::make_device_matrix_view( + all_distances.data(), n_queries, max_samples), + raft::make_device_matrix_view( + all_indices.data(), n_queries, max_samples), + raft::make_device_matrix_view(distances, n_queries, k), + raft::make_device_matrix_view(neighbors_uint32_ptr, n_queries, k), + /*select_min=*/true, + false, + cuvs::selection::SelectAlgo::kAuto, + num_samples_view); + } + + // For IP, the kernel returned negated scores so the warpsort could run + // ascending; postprocess_distances multiplies by -1 to restore the public + // contract (positive inner products). Cosine distance is already min-close + // so it must not be flipped (account_for_max_close = false for cosine). 
+ const bool account_for_max_close = index.metric() != cuvs::distance::DistanceType::CosineExpanded; + ivf::detail::postprocess_distances( + handle, distances, distances, index.metric(), n_queries, k, 1.0, account_for_max_close); + + ivf::detail::postprocess_neighbors(neighbors, + neighbors_uint32_ptr, + index.inds_ptrs().data_handle(), + coarse_indices_dev.data(), + chunk_index.data(), + n_queries, + n_probes, + k, + stream); +} + +template +inline void search_with_filtering(raft::resources const& handle, + const search_params& params, + const index& index, + const T* queries, + uint32_t n_queries, + uint32_t k, + int64_t* neighbors, + float* distances, + IvfSampleFilterT sample_filter = IvfSampleFilterT()) +{ + cuvs::common::nvtx::range fun_scope( + "ivf_sq::search(k = %u, n_queries = %u, dim = %zu)", k, n_queries, index.dim()); + + RAFT_EXPECTS(params.n_probes > 0, + "n_probes (number of clusters to probe in the search) must be positive."); + auto n_probes = std::min(params.n_probes, index.n_lists()); + if (n_probes < params.n_probes) { + RAFT_LOG_WARN( + "n_probes (%u) is larger than the number of lists in the index (%u), clamping to %u.", + params.n_probes, + index.n_lists(), + n_probes); + } + + bool manage_local_topk = is_local_topk_feasible(k); + + // Always compute max_samples: even on the fused path, search_impl may fall + // back to the materialize-all branch when the scan kernel reports zero + // occupancy, in which case the workspace must already be sized for the + // (much larger) materialize-path requirement. + int64_t ms = std::max(index.accum_sorted_sizes()(n_probes), k); + RAFT_EXPECTS(ms <= int64_t(std::numeric_limits::max()), + "The maximum sample size is too big."); + uint32_t max_samples = static_cast(ms); + + constexpr uint64_t kExpectedWsSize = 1024ull * 1024 * 1024; + uint64_t max_ws_size = + std::min(raft::resource::get_workspace_free_bytes(handle), kExpectedWsSize); + + uint64_t converted_query_floats = std::is_same_v ? 
0 : index.dim(); + + // Materialize-path estimate: used directly for the non-fused path, and as + // a conservative upper bound for the fused path so the fallback branch in + // search_impl never overshoots the workspace. + uint64_t ws_materialize = sizeof(float) * (uint64_t(index.n_lists()) + n_probes + 1 + + max_samples + converted_query_floats) + + sizeof(uint32_t) * (uint64_t(n_probes) * 2 + 1 + max_samples + k); + + uint64_t ws_per_query; + if (manage_local_topk) { + // Fused path: only small per-query buffers for coarse search + chunk indices. + // The scan output is at most grid_dim_x * k per query (we conservatively + // assume grid_dim_x <= n_probes for the workspace estimate). + uint64_t fused_out = uint64_t(n_probes) * k; + uint64_t ws_fused = sizeof(float) * (uint64_t(index.n_lists()) + n_probes + 1 + fused_out + + converted_query_floats) + + sizeof(uint32_t) * (uint64_t(n_probes) * 2 + 1 + fused_out + k); + // Take the conservative max so search_impl can safely fall back to the + // materialize path if the fused-path occupancy probe returns zero. 
+ ws_per_query = std::max(ws_fused, ws_materialize); + } else { + ws_per_query = ws_materialize; + } + + const uint32_t max_queries = + std::min(n_queries, std::max(1, max_ws_size / ws_per_query)); + + for (uint32_t offset_q = 0; offset_q < n_queries; offset_q += max_queries) { + uint32_t queries_batch = std::min(max_queries, n_queries - offset_q); + + search_impl(handle, + index, + queries + std::size_t(offset_q) * index.dim(), + queries_batch, + k, + n_probes, + neighbors + std::size_t(offset_q) * k, + distances + std::size_t(offset_q) * k, + raft::resource::get_workspace_resource_ref(handle), + sample_filter); + } +} + +template +void search_with_filtering(raft::resources const& handle, + const search_params& params, + const index& index, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances, + IvfSampleFilterT sample_filter = IvfSampleFilterT()) +{ + RAFT_EXPECTS( + queries.extent(0) == neighbors.extent(0) && queries.extent(0) == distances.extent(0), + "Number of rows in output neighbors and distances matrices must equal the number of queries."); + RAFT_EXPECTS(neighbors.extent(1) == distances.extent(1), + "Number of columns in output neighbors and distances matrices must be equal"); + RAFT_EXPECTS(queries.extent(1) == index.dim(), + "Number of query dimensions should equal number of dimensions in the index."); + + search_with_filtering(handle, + params, + index, + queries.data_handle(), + static_cast(queries.extent(0)), + static_cast(neighbors.extent(1)), + neighbors.data_handle(), + distances.data_handle(), + sample_filter); +} + +template +void search(raft::resources const& handle, + const search_params& params, + const index& idx, + raft::device_matrix_view queries, + raft::device_matrix_view neighbors, + raft::device_matrix_view distances, + const cuvs::neighbors::filtering::base_filter& sample_filter_ref) +{ + try { + auto& sample_filter = + dynamic_cast(sample_filter_ref); + return 
search_with_filtering(handle, params, idx, queries, neighbors, distances, sample_filter); + } catch (const std::bad_cast&) { + } + + try { + auto& sample_filter = + dynamic_cast&>( + sample_filter_ref); + return search_with_filtering(handle, params, idx, queries, neighbors, distances, sample_filter); + } catch (const std::bad_cast&) { + RAFT_FAIL("Unsupported sample filter type"); + } +} + +} // namespace cuvs::neighbors::ivf_sq::detail diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_search_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_sq/ivf_sq_search_uint8_t_int64_t.cu new file mode 100644 index 0000000000..48247e207b --- /dev/null +++ b/cpp/src/neighbors/ivf_sq/ivf_sq_search_uint8_t_int64_t.cu @@ -0,0 +1,30 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include "ivf_sq_search.cuh" + +namespace cuvs::neighbors::ivf_sq { + +#define CUVS_INST_IVF_SQ_SEARCH(T, CodeT) \ + void search(raft::resources const& handle, \ + const cuvs::neighbors::ivf_sq::search_params& params, \ + const cuvs::neighbors::ivf_sq::index& index, \ + raft::device_matrix_view queries, \ + raft::device_matrix_view neighbors, \ + raft::device_matrix_view distances, \ + const cuvs::neighbors::filtering::base_filter& sample_filter) \ + { \ + cuvs::neighbors::ivf_sq::detail::search( \ + handle, params, index, queries, neighbors, distances, sample_filter); \ + } + +CUVS_INST_IVF_SQ_SEARCH(float, uint8_t); +CUVS_INST_IVF_SQ_SEARCH(half, uint8_t); + +#undef CUVS_INST_IVF_SQ_SEARCH + +} // namespace cuvs::neighbors::ivf_sq diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_serialize.cuh b/cpp/src/neighbors/ivf_sq/ivf_sq_serialize.cuh new file mode 100644 index 0000000000..b201cceee7 --- /dev/null +++ b/cpp/src/neighbors/ivf_sq/ivf_sq_serialize.cuh @@ -0,0 +1,159 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include "../ivf_common.cuh" +#include "../ivf_list.cuh" +#include +#include + +#include +#include +#include +#include +#include + +#include + +namespace cuvs::neighbors::ivf_sq::detail { + +constexpr int serialization_version = 1; + +template +void serialize(raft::resources const& handle, std::ostream& os, const index& index_) +{ + RAFT_LOG_DEBUG( + "Saving IVF-SQ index, size %zu, dim %u", static_cast(index_.size()), index_.dim()); + + std::string dtype_string = raft::detail::numpy_serializer::get_numpy_dtype().to_string(); + dtype_string.resize(4); + os << dtype_string; + + serialize_scalar(handle, os, serialization_version); + serialize_scalar(handle, os, index_.size()); + serialize_scalar(handle, os, index_.dim()); + serialize_scalar(handle, os, index_.n_lists()); + serialize_scalar(handle, os, index_.metric()); + serialize_scalar(handle, os, index_.conservative_memory_allocation()); + serialize_mdspan(handle, os, index_.centers()); + + if (index_.center_norms()) { + bool has_norms = true; + serialize_scalar(handle, os, has_norms); + serialize_mdspan(handle, os, *index_.center_norms()); + } else { + bool has_norms = false; + serialize_scalar(handle, os, has_norms); + } + + serialize_mdspan(handle, os, index_.sq_vmin()); + serialize_mdspan(handle, os, index_.sq_delta()); + + auto sizes_host = raft::make_host_vector(index_.list_sizes().extent(0)); + raft::copy(sizes_host.data_handle(), + index_.list_sizes().data_handle(), + sizes_host.size(), + raft::resource::get_cuda_stream(handle)); + raft::resource::sync_stream(handle); + serialize_mdspan(handle, os, sizes_host.view()); + + list_spec list_store_spec{index_.dim(), true}; + for (uint32_t label = 0; label < index_.n_lists(); label++) { + ivf::serialize_list(handle, + os, + index_.lists()[label], + list_store_spec, + raft::Pow2::roundUp(sizes_host(label))); + } + raft::resource::sync_stream(handle); +} + +template +void serialize(raft::resources 
const& handle, + const std::string& filename, + const index& index_) +{ + std::ofstream of(filename, std::ios::out | std::ios::binary); + if (!of) { RAFT_FAIL("Cannot open file %s", filename.c_str()); } + detail::serialize(handle, of, index_); + of.close(); + if (!of) { RAFT_FAIL("Error writing output %s", filename.c_str()); } +} + +template +auto deserialize(raft::resources const& handle, std::istream& is) -> index +{ + char dtype_string[4]; + is.read(dtype_string, 4); + + auto ver = raft::deserialize_scalar(handle, is); + if (ver != serialization_version) { + RAFT_FAIL("serialization version mismatch, expected %d, got %d ", serialization_version, ver); + } + auto n_rows = raft::deserialize_scalar(handle, is); + auto dim = raft::deserialize_scalar(handle, is); + auto n_lists = raft::deserialize_scalar(handle, is); + auto metric = raft::deserialize_scalar(handle, is); + bool cma = raft::deserialize_scalar(handle, is); + + index index_ = index(handle, metric, n_lists, dim, cma); + + deserialize_mdspan(handle, is, index_.centers()); + + bool has_norms = raft::deserialize_scalar(handle, is); + if (has_norms) { + index_.allocate_center_norms(handle); + if (!index_.center_norms()) { + RAFT_FAIL("Error inconsistent center norms"); + } else { + auto center_norms = index_.center_norms().value(); + deserialize_mdspan(handle, is, center_norms); + } + } + + deserialize_mdspan(handle, is, index_.sq_vmin()); + deserialize_mdspan(handle, is, index_.sq_delta()); + + deserialize_mdspan(handle, is, index_.list_sizes()); + + list_spec list_device_spec{index_.dim(), cma}; + list_spec list_store_spec{index_.dim(), true}; + for (uint32_t label = 0; label < index_.n_lists(); label++) { + ivf::deserialize_list(handle, is, index_.lists()[label], list_store_spec, list_device_spec); + } + raft::resource::sync_stream(handle); + + ivf::detail::recompute_internal_state(handle, index_); + + return index_; +} + +template +auto deserialize(raft::resources const& handle, const std::string& 
filename) -> index +{ + std::ifstream is(filename, std::ios::in | std::ios::binary); + if (!is) { RAFT_FAIL("Cannot open file %s", filename.c_str()); } + auto index = detail::deserialize(handle, is); + is.close(); + return index; +} + +} // namespace cuvs::neighbors::ivf_sq::detail + +#define CUVS_INST_IVF_SQ_SERIALIZE(CodeT) \ + void serialize(raft::resources const& handle, \ + const std::string& filename, \ + const cuvs::neighbors::ivf_sq::index& index) \ + { \ + cuvs::neighbors::ivf_sq::detail::serialize(handle, filename, index); \ + } \ + \ + void deserialize(raft::resources const& handle, \ + const std::string& filename, \ + cuvs::neighbors::ivf_sq::index* index) \ + { \ + *index = cuvs::neighbors::ivf_sq::detail::deserialize(handle, filename); \ + } diff --git a/cpp/src/neighbors/ivf_sq/ivf_sq_serialize_uint8_t.cu b/cpp/src/neighbors/ivf_sq/ivf_sq_serialize_uint8_t.cu new file mode 100644 index 0000000000..c2351ed8c3 --- /dev/null +++ b/cpp/src/neighbors/ivf_sq/ivf_sq_serialize_uint8_t.cu @@ -0,0 +1,16 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include "ivf_sq_serialize.cuh" + +namespace cuvs::neighbors::ivf_sq { + +CUVS_INST_IVF_SQ_SERIALIZE(uint8_t); + +#undef CUVS_INST_IVF_SQ_SERIALIZE + +} // namespace cuvs::neighbors::ivf_sq diff --git a/cpp/src/neighbors/ivf_sq_index.cpp b/cpp/src/neighbors/ivf_sq_index.cpp new file mode 100644 index 0000000000..5a110a476c --- /dev/null +++ b/cpp/src/neighbors/ivf_sq_index.cpp @@ -0,0 +1,243 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include + +#include +#include + +#include + +namespace cuvs::neighbors::ivf_sq { + +template +index::index(raft::resources const& res) + : index(res, cuvs::distance::DistanceType::L2Expanded, 0, 0, false) +{ +} + +template +index::index(raft::resources const& res, const index_params& params, uint32_t dim) + : index(res, params.metric, params.n_lists, dim, params.conservative_memory_allocation) +{ +} + +template +index::index(raft::resources const& res, + cuvs::distance::DistanceType metric, + uint32_t n_lists, + uint32_t dim, + bool conservative_memory_allocation) + : cuvs::neighbors::index(), + metric_(metric), + conservative_memory_allocation_(conservative_memory_allocation), + lists_{n_lists}, + list_sizes_{raft::make_device_vector(res, n_lists)}, + centers_(raft::make_device_matrix(res, n_lists, dim)), + center_norms_(std::nullopt), + sq_vmin_{raft::make_device_vector(res, dim)}, + sq_delta_{raft::make_device_vector(res, dim)}, + data_ptrs_{raft::make_device_vector(res, n_lists)}, + inds_ptrs_{raft::make_device_vector(res, n_lists)}, + accum_sorted_sizes_{raft::make_host_vector(n_lists + 1)} +{ + check_consistency(); + auto stream = raft::resource::get_cuda_stream(res); + std::memset(accum_sorted_sizes_.data_handle(), 0, accum_sorted_sizes_.size() * sizeof(int64_t)); + RAFT_CUDA_TRY( + cudaMemsetAsync(list_sizes_.data_handle(), 0, list_sizes_.size() * sizeof(uint32_t), stream)); + RAFT_CUDA_TRY( + cudaMemsetAsync(data_ptrs_.data_handle(), 0, data_ptrs_.size() * sizeof(CodeT*), stream)); + RAFT_CUDA_TRY( + cudaMemsetAsync(inds_ptrs_.data_handle(), 0, inds_ptrs_.size() * sizeof(int64_t*), stream)); +} + +template +cuvs::distance::DistanceType index::metric() const noexcept +{ + return metric_; +} + +template +int64_t index::size() const noexcept +{ + return accum_sorted_sizes()(n_lists()); +} + +template +uint32_t index::dim() const noexcept +{ + return centers_.extent(1); +} + +template +uint32_t index::n_lists() 
const noexcept +{ + return lists_.size(); +} + +template +bool index::conservative_memory_allocation() const noexcept +{ + return conservative_memory_allocation_; +} + +template +raft::device_vector_view index::list_sizes() noexcept +{ + return list_sizes_.view(); +} + +template +raft::device_vector_view index::list_sizes() const noexcept +{ + return list_sizes_.view(); +} + +template +raft::device_matrix_view index::centers() noexcept +{ + return centers_.view(); +} + +template +raft::device_matrix_view index::centers() + const noexcept +{ + return centers_.view(); +} + +template +std::optional> index::center_norms() noexcept +{ + if (center_norms_.has_value()) { + return std::make_optional>(center_norms_->view()); + } else { + return std::nullopt; + } +} + +template +std::optional> index::center_norms() + const noexcept +{ + if (center_norms_.has_value()) { + return std::make_optional>( + center_norms_->view()); + } else { + return std::nullopt; + } +} + +template +void index::allocate_center_norms(raft::resources const& res) +{ + switch (metric_) { + case cuvs::distance::DistanceType::L2Expanded: + case cuvs::distance::DistanceType::L2SqrtExpanded: + case cuvs::distance::DistanceType::L2Unexpanded: + case cuvs::distance::DistanceType::L2SqrtUnexpanded: + case cuvs::distance::DistanceType::CosineExpanded: + center_norms_ = raft::make_device_vector(res, n_lists()); + break; + default: center_norms_ = std::nullopt; + } +} + +template +raft::device_vector_view index::sq_vmin() noexcept +{ + return sq_vmin_.view(); +} + +template +raft::device_vector_view index::sq_vmin() const noexcept +{ + return sq_vmin_.view(); +} + +template +raft::device_vector_view index::sq_delta() noexcept +{ + return sq_delta_.view(); +} + +template +raft::device_vector_view index::sq_delta() const noexcept +{ + return sq_delta_.view(); +} + +template +raft::host_vector_view index::accum_sorted_sizes() noexcept +{ + return accum_sorted_sizes_.view(); +} + +template +raft::host_vector_view 
index::accum_sorted_sizes() const noexcept +{ + return accum_sorted_sizes_.view(); +} + +template +raft::device_vector_view index::data_ptrs() noexcept +{ + return data_ptrs_.view(); +} + +template +raft::device_vector_view index::data_ptrs() const noexcept +{ + return data_ptrs_.view(); +} + +template +raft::device_vector_view index::inds_ptrs() noexcept +{ + return inds_ptrs_.view(); +} + +template +raft::device_vector_view index::inds_ptrs() const noexcept +{ + return inds_ptrs_.view(); +} + +template +std::vector>>& index::lists() noexcept +{ + return lists_; +} + +template +const std::vector>>& index::lists() const noexcept +{ + return lists_; +} + +template +void index::check_consistency() +{ + auto n_lists = lists_.size(); + RAFT_EXPECTS(list_sizes_.extent(0) == n_lists, "inconsistent list size"); + RAFT_EXPECTS(data_ptrs_.extent(0) == n_lists, "inconsistent list size"); + RAFT_EXPECTS(inds_ptrs_.extent(0) == n_lists, "inconsistent list size"); + RAFT_EXPECTS((centers_.extent(0) == list_sizes_.extent(0)) && + (!center_norms_.has_value() || centers_.extent(0) == center_norms_->extent(0)), + "inconsistent number of lists (clusters)"); + RAFT_EXPECTS(sq_vmin_.extent(0) == centers_.extent(1), + "sq_vmin size (%u) does not match dim (%u)", + static_cast(sq_vmin_.extent(0)), + static_cast(centers_.extent(1))); + RAFT_EXPECTS(sq_delta_.extent(0) == centers_.extent(1), + "sq_delta size (%u) does not match dim (%u)", + static_cast(sq_delta_.extent(0)), + static_cast(centers_.extent(1))); +} + +template struct index; + +} // namespace cuvs::neighbors::ivf_sq diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index b30f108789..b8ffd77387 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -125,6 +125,13 @@ ConfigureTest( PERCENT 100 ) +ConfigureTest( + NAME NEIGHBORS_ANN_IVF_FLAT_UDF_TEST + PATH neighbors/ann_ivf_flat/test_udf.cu + GPUS 1 + PERCENT 100 +) + ConfigureTest( NAME NEIGHBORS_ANN_IVF_FLAT_TEST PATH 
neighbors/ann_ivf_flat/test_float_int64_t.cu neighbors/ann_ivf_flat/test_half_int64_t.cu @@ -134,8 +141,8 @@ ConfigureTest( ) ConfigureTest( - NAME NEIGHBORS_ANN_IVF_FLAT_UDF_TEST - PATH neighbors/ann_ivf_flat/test_udf.cu + NAME NEIGHBORS_ANN_IVF_SQ_TEST + PATH neighbors/ann_ivf_sq/test_float_int64_t.cu neighbors/ann_ivf_sq/test_half_int64_t.cu GPUS 1 PERCENT 100 ) diff --git a/cpp/tests/neighbors/ann_ivf_sq.cuh b/cpp/tests/neighbors/ann_ivf_sq.cuh new file mode 100644 index 0000000000..b62892d9b1 --- /dev/null +++ b/cpp/tests/neighbors/ann_ivf_sq.cuh @@ -0,0 +1,573 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include "../test_utils.cuh" +#include "ann_utils.cuh" +#include "naive_knn.cuh" + +#include +#include +#include +#include + +#include +#include + +#include + +namespace cuvs::neighbors::ivf_sq { + +struct test_ivf_sample_filter { + static constexpr unsigned offset = 300; +}; + +template +struct AnnIvfSqInputs { + IdxT num_queries; + IdxT num_db_vecs; + IdxT dim; + IdxT k; + IdxT nprobe; + IdxT nlist; + cuvs::distance::DistanceType metric; + bool host_dataset = false; +}; + +template +::std::ostream& operator<<(::std::ostream& os, const AnnIvfSqInputs& p) +{ + os << "{ " << p.num_queries << ", " << p.num_db_vecs << ", " << p.dim << ", " << p.k << ", " + << p.nprobe << ", " << p.nlist << ", " + << cuvs::neighbors::print_metric{static_cast((int)p.metric)} + << ", " << (p.host_dataset ? 
"host" : "device") << '}' << std::endl; + return os; +} + +template +class AnnIVFSQTest : public ::testing::TestWithParam> { + public: + AnnIVFSQTest() + : stream_(raft::resource::get_cuda_stream(handle_)), + ps(::testing::TestWithParam>::GetParam()), + database(0, stream_), + search_queries(0, stream_) + { + } + + void testAll() + { + auto naive = compute_naive_knn(); + auto idx = build_index(true); + + { + SCOPED_TRACE("Search"); + checkSearch(idx, naive); + } + { + SCOPED_TRACE("Serialize"); + checkSerialize(idx); + } + { + SCOPED_TRACE("Filter"); + checkFilter(idx); + } + { + SCOPED_TRACE("Extend"); + checkExtend(naive); + } + } + + protected: + struct SearchResults { + std::vector indices; + std::vector distances; + }; + + void checkSearch(const cuvs::neighbors::ivf_sq::index& idx, const SearchResults& naive) + { + auto results = search_index(idx); + + float eps = 0.1; + ASSERT_TRUE(eval_neighbours(naive.indices, + results.indices, + naive.distances, + results.distances, + ps.num_queries, + ps.k, + eps, + min_recall_threshold())); + } + + void checkSerialize(const cuvs::neighbors::ivf_sq::index& idx) + { + tmp_index_file index_file; + cuvs::neighbors::ivf_sq::serialize(handle_, index_file.filename, idx); + cuvs::neighbors::ivf_sq::index index_loaded(handle_); + cuvs::neighbors::ivf_sq::deserialize(handle_, index_file.filename, &index_loaded); + + ASSERT_EQ(idx.size(), index_loaded.size()); + ASSERT_EQ(idx.dim(), index_loaded.dim()); + ASSERT_EQ(idx.n_lists(), index_loaded.n_lists()); + + auto results_orig = search_index(idx); + auto results_loaded = search_index(index_loaded); + + float eps = 0.001; + ASSERT_TRUE(eval_neighbours(results_orig.indices, + results_loaded.indices, + results_orig.distances, + results_loaded.distances, + ps.num_queries, + ps.k, + eps, + 1.0)); + } + + void checkExtend(const SearchResults& naive) + { + auto idx_empty = build_index(false); + extend_index(&idx_empty); + + auto results = search_index(idx_empty); + + float eps = 0.1; + 
ASSERT_TRUE(eval_neighbours(naive.indices, + results.indices, + naive.distances, + results.distances, + ps.num_queries, + ps.k, + eps, + min_recall_threshold())); + } + + void checkFilter(const cuvs::neighbors::ivf_sq::index& idx) + { + if (ps.num_db_vecs <= static_cast(test_ivf_sample_filter::offset)) { + GTEST_SKIP() << "Skipping filter test: num_db_vecs <= filter offset"; + } + + size_t queries_size = ps.num_queries * ps.k; + std::vector indices_ivfsq(queries_size); + std::vector indices_naive(queries_size); + std::vector distances_ivfsq(queries_size); + std::vector distances_naive(queries_size); + + { + rmm::device_uvector distances_naive_dev(queries_size, stream_); + rmm::device_uvector indices_naive_dev(queries_size, stream_); + auto* database_filtered_ptr = database.data() + test_ivf_sample_filter::offset * ps.dim; + cuvs::neighbors::naive_knn(handle_, + distances_naive_dev.data(), + indices_naive_dev.data(), + search_queries.data(), + database_filtered_ptr, + ps.num_queries, + ps.num_db_vecs - test_ivf_sample_filter::offset, + ps.dim, + ps.k, + ps.metric); + raft::linalg::addScalar(indices_naive_dev.data(), + indices_naive_dev.data(), + IdxT(test_ivf_sample_filter::offset), + queries_size, + stream_); + raft::update_host(distances_naive.data(), distances_naive_dev.data(), queries_size, stream_); + raft::update_host(indices_naive.data(), indices_naive_dev.data(), queries_size, stream_); + raft::resource::sync_stream(handle_); + } + + { + double min_recall = + std::min(1.0, static_cast(ps.nprobe) / static_cast(ps.nlist)); + + rmm::device_uvector distances_ivfsq_dev(queries_size, stream_); + rmm::device_uvector indices_ivfsq_dev(queries_size, stream_); + + { + cuvs::neighbors::ivf_sq::search_params search_params; + search_params.n_probes = ps.nprobe; + + auto removed_indices = + raft::make_device_vector(handle_, test_ivf_sample_filter::offset); + raft::linalg::map_offset(handle_, removed_indices.view(), raft::identity_op{}); + 
raft::resource::sync_stream(handle_); + + cuvs::core::bitset removed_indices_bitset( + handle_, removed_indices.view(), ps.num_db_vecs); + auto bitset_filter_obj = + cuvs::neighbors::filtering::bitset_filter(removed_indices_bitset.view()); + + auto search_queries_view = raft::make_device_matrix_view( + search_queries.data(), ps.num_queries, ps.dim); + auto indices_out_view = + raft::make_device_matrix_view(indices_ivfsq_dev.data(), ps.num_queries, ps.k); + auto dists_out_view = + raft::make_device_matrix_view(distances_ivfsq_dev.data(), ps.num_queries, ps.k); + + cuvs::neighbors::ivf_sq::search(handle_, + search_params, + idx, + search_queries_view, + indices_out_view, + dists_out_view, + bitset_filter_obj); + + raft::update_host( + distances_ivfsq.data(), distances_ivfsq_dev.data(), queries_size, stream_); + raft::update_host(indices_ivfsq.data(), indices_ivfsq_dev.data(), queries_size, stream_); + raft::resource::sync_stream(handle_); + } + float eps = 0.1; + ASSERT_TRUE(eval_neighbours(indices_naive, + indices_ivfsq, + distances_naive, + distances_ivfsq, + ps.num_queries, + ps.k, + eps, + min_recall)); + } + } + + void SetUp() override + { + database.resize(ps.num_db_vecs * ps.dim, stream_); + search_queries.resize(ps.num_queries * ps.dim, stream_); + + raft::random::RngState r(1234ULL); + if constexpr (std::is_same_v || std::is_same_v) { + raft::random::uniform( + handle_, r, database.data(), ps.num_db_vecs * ps.dim, DataT(0.1), DataT(2.0)); + raft::random::uniform( + handle_, r, search_queries.data(), ps.num_queries * ps.dim, DataT(0.1), DataT(2.0)); + } + raft::resource::sync_stream(handle_); + } + + void TearDown() override + { + raft::resource::sync_stream(handle_); + database.resize(0, stream_); + search_queries.resize(0, stream_); + } + + double min_recall_threshold() + { + return std::min(1.0, static_cast(ps.nprobe) / static_cast(ps.nlist)); + } + + SearchResults compute_naive_knn() + { + size_t queries_size = ps.num_queries * ps.k; + SearchResults 
results; + results.indices.resize(queries_size); + results.distances.resize(queries_size); + + rmm::device_uvector distances_dev(queries_size, stream_); + rmm::device_uvector indices_dev(queries_size, stream_); + cuvs::neighbors::naive_knn(handle_, + distances_dev.data(), + indices_dev.data(), + search_queries.data(), + database.data(), + ps.num_queries, + ps.num_db_vecs, + ps.dim, + ps.k, + ps.metric); + raft::update_host(results.distances.data(), distances_dev.data(), queries_size, stream_); + raft::update_host(results.indices.data(), indices_dev.data(), queries_size, stream_); + raft::resource::sync_stream(handle_); + return results; + } + + cuvs::neighbors::ivf_sq::index build_index(bool add_data_on_build) + { + cuvs::neighbors::ivf_sq::index_params index_params; + index_params.n_lists = ps.nlist; + index_params.metric = ps.metric; + index_params.add_data_on_build = add_data_on_build; + index_params.kmeans_trainset_fraction = 0.5; + + if (!ps.host_dataset) { + auto database_view = raft::make_device_matrix_view( + (const DataT*)database.data(), ps.num_db_vecs, ps.dim); + return cuvs::neighbors::ivf_sq::build(handle_, index_params, database_view); + } else { + auto host_database = raft::make_host_matrix(ps.num_db_vecs, ps.dim); + raft::copy(host_database.data_handle(), database.data(), ps.num_db_vecs * ps.dim, stream_); + raft::resource::sync_stream(handle_); + return cuvs::neighbors::ivf_sq::build( + handle_, index_params, raft::make_const_mdspan(host_database.view())); + } + } + + void extend_index(cuvs::neighbors::ivf_sq::index* idx) + { + if (!ps.host_dataset) { + auto database_view = raft::make_device_matrix_view( + (const DataT*)database.data(), ps.num_db_vecs, ps.dim); + auto vector_indices = raft::make_device_vector(handle_, ps.num_db_vecs); + raft::linalg::map_offset(handle_, vector_indices.view(), raft::identity_op{}); + raft::resource::sync_stream(handle_); + + auto indices_view = raft::make_device_vector_view( + vector_indices.data_handle(), 
ps.num_db_vecs); + cuvs::neighbors::ivf_sq::extend( + handle_, + database_view, + std::make_optional>(indices_view), + idx); + } else { + auto host_database = raft::make_host_matrix(ps.num_db_vecs, ps.dim); + raft::copy(host_database.data_handle(), database.data(), ps.num_db_vecs * ps.dim, stream_); + raft::resource::sync_stream(handle_); + + auto vector_indices = raft::make_host_vector(handle_, ps.num_db_vecs); + std::iota( + vector_indices.data_handle(), vector_indices.data_handle() + ps.num_db_vecs, IdxT(0)); + + auto indices_view = + raft::make_host_vector_view(vector_indices.data_handle(), ps.num_db_vecs); + auto host_database_view = raft::make_host_matrix_view( + host_database.data_handle(), ps.num_db_vecs, ps.dim); + cuvs::neighbors::ivf_sq::extend( + handle_, + host_database_view, + std::make_optional>(indices_view), + idx); + } + } + + SearchResults search_index(const cuvs::neighbors::ivf_sq::index& idx) + { + size_t queries_size = ps.num_queries * ps.k; + SearchResults results; + results.indices.resize(queries_size); + results.distances.resize(queries_size); + + cuvs::neighbors::ivf_sq::search_params search_params; + search_params.n_probes = ps.nprobe; + + rmm::device_uvector distances_dev(queries_size, stream_); + rmm::device_uvector indices_dev(queries_size, stream_); + + auto search_queries_view = raft::make_device_matrix_view( + search_queries.data(), ps.num_queries, ps.dim); + auto indices_out_view = + raft::make_device_matrix_view(indices_dev.data(), ps.num_queries, ps.k); + auto dists_out_view = + raft::make_device_matrix_view(distances_dev.data(), ps.num_queries, ps.k); + + cuvs::neighbors::ivf_sq::search( + handle_, search_params, idx, search_queries_view, indices_out_view, dists_out_view); + + raft::update_host(results.distances.data(), distances_dev.data(), queries_size, stream_); + raft::update_host(results.indices.data(), indices_dev.data(), queries_size, stream_); + raft::resource::sync_stream(handle_); + return results; + } + + 
raft::resources handle_; + rmm::cuda_stream_view stream_; + AnnIvfSqInputs ps; + rmm::device_uvector database; + rmm::device_uvector search_queries; +}; + +const std::vector> inputs = { + // num_queries, num_db_vecs, dim, k, nprobe, nlist, metric + + // ===== Dimension edge cases (all four metrics) ===== + // dim=1 (CosineExpanded excluded: requires dim > 1) + {1000, 10000, 1, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 1, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct}, + {1000, 10000, 1, 10, 40, 1024, cuvs::distance::DistanceType::L2SqrtExpanded}, + // dim=2,3,4,5 (unaligned) + {1000, 10000, 2, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 2, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded}, + {1000, 10000, 3, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 3, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded}, + {1000, 10000, 4, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 4, 16, 40, 1024, cuvs::distance::DistanceType::InnerProduct}, + {1000, 10000, 5, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 5, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded}, + // dim=7,8 (around veclen=16 boundary, not a multiple of veclen) + {1000, 10000, 7, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 7, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded}, + {1000, 10000, 8, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 8, 16, 40, 1024, cuvs::distance::DistanceType::InnerProduct}, + {1000, 10000, 8, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded}, + // dim=15,16,17 (around veclen=16 boundary) + {1000, 10000, 15, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 15, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded}, + {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 
16, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct}, + {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded}, + {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2SqrtExpanded}, + {1000, 10000, 17, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 17, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded}, + // dim=31,32,33 (around 2*veclen boundary) + {1000, 10000, 31, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 31, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded}, + {1000, 10000, 32, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 32, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct}, + {1000, 10000, 32, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded}, + {1000, 10000, 33, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 33, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct}, + // medium dims + {1000, 10000, 64, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 64, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded}, + {1000, 10000, 128, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 128, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct}, + {1000, 10000, 128, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded}, + {1000, 10000, 128, 10, 40, 1024, cuvs::distance::DistanceType::L2SqrtExpanded}, + {1000, 10000, 256, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 256, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct}, + // large dims (may exceed shared memory limits) + {1000, 10000, 2048, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 2048, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded}, + {1000, 10000, 2049, 16, 40, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 2049, 16, 40, 1024, 
cuvs::distance::DistanceType::CosineExpanded}, + {1000, 10000, 2050, 16, 40, 1024, cuvs::distance::DistanceType::InnerProduct}, + {1000, 10000, 2050, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded}, + {1000, 10000, 4096, 20, 50, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 4096, 20, 50, 1024, cuvs::distance::DistanceType::InnerProduct}, + {1000, 10000, 4096, 20, 50, 1024, cuvs::distance::DistanceType::CosineExpanded}, + + // ===== k edge cases ===== + {1000, 10000, 16, 1, 40, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 16, 1, 40, 1024, cuvs::distance::DistanceType::InnerProduct}, + {1000, 10000, 16, 1, 40, 1024, cuvs::distance::DistanceType::CosineExpanded}, + {1000, 10000, 16, 2, 40, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 16, 5, 40, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 16, 20, 40, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 16, 20, 40, 1024, cuvs::distance::DistanceType::CosineExpanded}, + {1000, 10000, 16, 50, 100, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 16, 100, 200, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 16, 100, 200, 1024, cuvs::distance::DistanceType::InnerProduct}, + + // ===== Large k (beyond fused top-k kMaxSqScanCapacity=256, exercises materialized fallback) + // ===== + // k=257: smallest k that forces the materialized path (Capacity clamped to 0) + {100, 10000, 32, 257, 100, 64, cuvs::distance::DistanceType::L2Expanded}, + {100, 10000, 32, 257, 100, 64, cuvs::distance::DistanceType::InnerProduct}, + {100, 10000, 32, 257, 100, 64, cuvs::distance::DistanceType::CosineExpanded}, + // k=300: comfortably above the fused top-k threshold + {100, 10000, 32, 300, 64, 64, cuvs::distance::DistanceType::L2Expanded}, + {100, 10000, 32, 300, 64, 64, cuvs::distance::DistanceType::InnerProduct}, + + // ===== nprobe / 
nlist edge cases ===== + // nprobe == nlist (exhaustive probe) + {1000, 10000, 16, 10, 64, 64, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 16, 10, 64, 64, cuvs::distance::DistanceType::InnerProduct}, + {1000, 10000, 16, 10, 64, 64, cuvs::distance::DistanceType::CosineExpanded}, + // nprobe == 1 (minimal probe) + {1000, 10000, 16, 10, 1, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 16, 10, 1, 1024, cuvs::distance::DistanceType::CosineExpanded}, + // nprobe > nlist (clamped to nlist) + {1000, 10000, 16, 10, 2048, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 16, 10, 2048, 1024, cuvs::distance::DistanceType::CosineExpanded}, + // various nprobe + {1000, 10000, 16, 10, 50, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 16, 10, 70, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 16, 10, 50, 1024, cuvs::distance::DistanceType::InnerProduct}, + {1000, 10000, 16, 10, 70, 1024, cuvs::distance::DistanceType::InnerProduct}, + {1000, 10000, 16, 10, 50, 1024, cuvs::distance::DistanceType::CosineExpanded}, + {1000, 10000, 16, 10, 70, 1024, cuvs::distance::DistanceType::CosineExpanded}, + {1000, 10000, 16, 10, 50, 1024, cuvs::distance::DistanceType::L2SqrtExpanded}, + {1000, 10000, 16, 10, 70, 1024, cuvs::distance::DistanceType::L2SqrtExpanded}, + // very small nlist + {100, 10000, 16, 10, 8, 8, cuvs::distance::DistanceType::L2Expanded}, + {100, 10000, 16, 10, 8, 8, cuvs::distance::DistanceType::CosineExpanded}, + // smaller nlist + {100, 10000, 16, 10, 20, 512, cuvs::distance::DistanceType::L2Expanded}, + {100, 10000, 16, 10, 20, 512, cuvs::distance::DistanceType::InnerProduct}, + {100, 10000, 16, 10, 20, 512, cuvs::distance::DistanceType::CosineExpanded}, + {100, 10000, 16, 10, 20, 512, cuvs::distance::DistanceType::L2SqrtExpanded}, + + // ===== Dataset size edge cases ===== + // single query + {1, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1, 10000, 16, 10, 40, 
1024, cuvs::distance::DistanceType::CosineExpanded}, + // very few queries + {2, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded}, + {5, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded}, + // very few db vectors (nlist reduced to fit) + {100, 500, 16, 10, 40, 256, cuvs::distance::DistanceType::L2Expanded}, + {100, 500, 16, 10, 40, 256, cuvs::distance::DistanceType::CosineExpanded}, + // larger datasets + {20, 100000, 16, 10, 20, 1024, cuvs::distance::DistanceType::L2Expanded}, + {20, 100000, 16, 10, 20, 1024, cuvs::distance::DistanceType::CosineExpanded}, + {1000, 100000, 16, 10, 20, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 100000, 16, 10, 20, 1024, cuvs::distance::DistanceType::CosineExpanded}, + {10000, 131072, 8, 10, 20, 1024, cuvs::distance::DistanceType::L2Expanded}, + {10000, 131072, 8, 10, 20, 1024, cuvs::distance::DistanceType::CosineExpanded}, + {10000, 131072, 8, 10, 50, 1024, cuvs::distance::DistanceType::InnerProduct}, + {10000, 131072, 8, 10, 50, 1024, cuvs::distance::DistanceType::L2SqrtExpanded}, + + // ===== Large query batches (gridDim.x > 65535) ===== + {100000, 1024, 32, 10, 64, 64, cuvs::distance::DistanceType::L2Expanded}, + {100000, 1024, 32, 10, 64, 64, cuvs::distance::DistanceType::InnerProduct}, + {100000, 1024, 32, 10, 64, 64, cuvs::distance::DistanceType::CosineExpanded}, + {100000, 1024, 32, 10, 64, 64, cuvs::distance::DistanceType::L2SqrtExpanded}, + {100000, 8712, 3, 10, 51, 66, cuvs::distance::DistanceType::L2Expanded}, + {100000, 8712, 3, 10, 51, 66, cuvs::distance::DistanceType::CosineExpanded}, + // just above the old 65535 limit + {65536, 1024, 16, 10, 32, 64, cuvs::distance::DistanceType::L2Expanded}, + {65536, 1024, 16, 10, 32, 64, cuvs::distance::DistanceType::CosineExpanded}, + + // ===== Recall-stability: same data, different query counts ===== + {20000, 8712, 3, 10, 51, 66, cuvs::distance::DistanceType::L2Expanded}, + {50000, 8712, 3, 10, 51, 66, 
cuvs::distance::DistanceType::L2Expanded}, + + // ===== Host dataset: build + extend from host_matrix_view ===== + {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, true}, + {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct, true}, + {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, true}, + {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2SqrtExpanded, true}, + {1000, 10000, 3, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, true}, + {1000, 10000, 128, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, true}, + {1000, 10000, 128, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, true}, + {100, 10000, 64, 10, 20, 512, cuvs::distance::DistanceType::InnerProduct, true}, +}; + +const std::vector> inputs_half = { + // num_queries, num_db_vecs, dim, k, nprobe, nlist, metric, host_dataset + + // All four metrics at a standard dimension + {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct}, + {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded}, + {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2SqrtExpanded}, + + // Unaligned and small dimensions + {1000, 10000, 3, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 7, 16, 40, 1024, cuvs::distance::DistanceType::CosineExpanded}, + + // Medium / larger dimensions + {1000, 10000, 64, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 128, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct}, + {1000, 10000, 128, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded}, + {1000, 10000, 256, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded}, + + // k edge cases + {1000, 10000, 16, 1, 40, 1024, cuvs::distance::DistanceType::L2Expanded}, + {1000, 10000, 16, 50, 100, 1024, cuvs::distance::DistanceType::L2Expanded}, + + 
// nprobe / nlist edge cases + {1000, 10000, 16, 10, 64, 64, cuvs::distance::DistanceType::L2Expanded}, + {100, 10000, 16, 10, 20, 512, cuvs::distance::DistanceType::CosineExpanded}, + + // Host dataset + {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::L2Expanded, true}, + {1000, 10000, 16, 10, 40, 1024, cuvs::distance::DistanceType::CosineExpanded, true}, + {1000, 10000, 128, 10, 40, 1024, cuvs::distance::DistanceType::InnerProduct, true}, +}; + +} // namespace cuvs::neighbors::ivf_sq diff --git a/cpp/tests/neighbors/ann_ivf_sq/test_float_int64_t.cu b/cpp/tests/neighbors/ann_ivf_sq/test_float_int64_t.cu new file mode 100644 index 0000000000..8831ae720a --- /dev/null +++ b/cpp/tests/neighbors/ann_ivf_sq/test_float_int64_t.cu @@ -0,0 +1,17 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include <gtest/gtest.h> + +#include "../ann_ivf_sq.cuh" + +namespace cuvs::neighbors::ivf_sq { + +typedef AnnIVFSQTest<float, int64_t> AnnIVFSQTestF_float; +TEST_P(AnnIVFSQTestF_float, AnnIVFSQ) { this->testAll(); } + +INSTANTIATE_TEST_CASE_P(AnnIVFSQTest, AnnIVFSQTestF_float, ::testing::ValuesIn(inputs)); + +} // namespace cuvs::neighbors::ivf_sq diff --git a/cpp/tests/neighbors/ann_ivf_sq/test_half_int64_t.cu b/cpp/tests/neighbors/ann_ivf_sq/test_half_int64_t.cu new file mode 100644 index 0000000000..fc17e246dd --- /dev/null +++ b/cpp/tests/neighbors/ann_ivf_sq/test_half_int64_t.cu @@ -0,0 +1,17 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include <gtest/gtest.h> + +#include "../ann_ivf_sq.cuh" + +namespace cuvs::neighbors::ivf_sq { + +typedef AnnIVFSQTest<half, int64_t> AnnIVFSQTestF_half; +TEST_P(AnnIVFSQTestF_half, AnnIVFSQ) { this->testAll(); } + +INSTANTIATE_TEST_CASE_P(AnnIVFSQTest, AnnIVFSQTestF_half, ::testing::ValuesIn(inputs_half)); + +} // namespace cuvs::neighbors::ivf_sq diff --git a/docs/source/cpp_api/neighbors.rst b/docs/source/cpp_api/neighbors.rst index 66b4e0c4aa..2aed5ca9e0 100644 --- a/docs/source/cpp_api/neighbors.rst +++ b/docs/source/cpp_api/neighbors.rst @@ -18,6 +18,7 @@ Nearest Neighbors neighbors_hnsw.rst neighbors_ivf_flat.rst neighbors_ivf_pq.rst + neighbors_ivf_sq.rst neighbors_mg.rst neighbors_nn_descent.rst neighbors_refine.rst diff --git a/docs/source/cpp_api/neighbors_ivf_sq.rst b/docs/source/cpp_api/neighbors_ivf_sq.rst new file mode 100644 index 0000000000..d0554f926a --- /dev/null +++ b/docs/source/cpp_api/neighbors_ivf_sq.rst @@ -0,0 +1,68 @@ +IVF-SQ +====== + +The IVF-SQ method is an ANN algorithm. Like IVF-Flat, IVF-SQ splits the points into a number of clusters (also specified by a parameter called n_lists) and searches the closest clusters to compute the nearest neighbors (also specified by a parameter called n_probes), but it shrinks the sizes of the vectors using scalar quantization, independently mapping each dimension to a fixed-width integer code. + +.. role:: py(code) + :language: c++ + :class: highlight + +``#include <cuvs/neighbors/ivf_sq.hpp>`` + +namespace *cuvs::neighbors::ivf_sq* + +Index build parameters +---------------------- + +.. doxygengroup:: ivf_sq_cpp_index_params + :project: cuvs + :members: + :content-only: + +Index search parameters +----------------------- + +.. doxygengroup:: ivf_sq_cpp_search_params + :project: cuvs + :members: + :content-only: + +Index +----- + +.. doxygengroup:: ivf_sq_cpp_index + :project: cuvs + :members: + :content-only: + +Index build +----------- + +.. 
doxygengroup:: ivf_sq_cpp_index_build + :project: cuvs + :members: + :content-only: + +Index extend +------------ + +.. doxygengroup:: ivf_sq_cpp_index_extend + :project: cuvs + :members: + :content-only: + +Index search +------------ + +.. doxygengroup:: ivf_sq_cpp_index_search + :project: cuvs + :members: + :content-only: + +Index serialize +--------------- + +.. doxygengroup:: ivf_sq_cpp_index_serialize + :project: cuvs + :members: + :content-only: diff --git a/python/cuvs_bench/cuvs_bench/config/algorithms.yaml b/python/cuvs_bench/cuvs_bench/config/algorithms.yaml index fa2195fc61..f181e549eb 100644 --- a/python/cuvs_bench/cuvs_bench/config/algorithms.yaml +++ b/python/cuvs_bench/cuvs_bench/config/algorithms.yaml @@ -8,7 +8,7 @@ faiss_gpu_ivf_pq: executable: FAISS_GPU_IVF_PQ_ANN_BENCH requires_gpu: true faiss_gpu_ivf_sq: - executable: FAISS_GPU_IVF_PQ_ANN_BENCH + executable: FAISS_GPU_IVF_SQ_ANN_BENCH requires_gpu: true faiss_gpu_cagra: executable: FAISS_GPU_CAGRA_ANN_BENCH @@ -25,6 +25,9 @@ faiss_cpu_ivf_flat: faiss_cpu_ivf_pq: executable: FAISS_CPU_IVF_PQ_ANN_BENCH requires_gpu: false +faiss_cpu_ivf_sq: + executable: FAISS_CPU_IVF_SQ_ANN_BENCH + requires_gpu: false faiss_cpu_hnsw_flat: executable: FAISS_CPU_HNSW_FLAT_ANN_BENCH requires_gpu: false @@ -34,6 +37,9 @@ cuvs_ivf_flat: cuvs_ivf_pq: executable: CUVS_IVF_PQ_ANN_BENCH requires_gpu: true +cuvs_ivf_sq: + executable: CUVS_IVF_SQ_ANN_BENCH + requires_gpu: true cuvs_cagra: executable: CUVS_CAGRA_ANN_BENCH requires_gpu: true diff --git a/python/cuvs_bench/cuvs_bench/config/algos/constraints/__init__.py b/python/cuvs_bench/cuvs_bench/config/algos/constraints/__init__.py index 9111bdc3b9..f22852f0ae 100644 --- a/python/cuvs_bench/cuvs_bench/config/algos/constraints/__init__.py +++ b/python/cuvs_bench/cuvs_bench/config/algos/constraints/__init__.py @@ -1,5 +1,5 @@ # -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 @@ -50,11 +50,29 @@ def cuvs_cagra_search(params, build_params, k, batch_size): return True +def cuvs_ivf_sq_search(params, build_params, k, batch_size): + if "nlist" in build_params and "nprobe" in params: + return build_params["nlist"] >= params["nprobe"] + return True + + ############################################################################### # FAISS constraints # ############################################################################### +def faiss_gpu_ivf_sq_search(params, build_params, k, batch_size): + if "nlist" in build_params and "nprobe" in params: + return build_params["nlist"] >= params["nprobe"] + return True + + +def faiss_cpu_ivf_sq_search(params, build_params, k, batch_size): + if "nlist" in build_params and "nprobe" in params: + return build_params["nlist"] >= params["nprobe"] + return True + + def faiss_gpu_ivf_pq_build(params, dims): ret = True # M must be defined diff --git a/python/cuvs_bench/cuvs_bench/config/algos/cuvs_ivf_sq.yaml b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_ivf_sq.yaml new file mode 100644 index 0000000000..af59493eef --- /dev/null +++ b/python/cuvs_bench/cuvs_bench/config/algos/cuvs_ivf_sq.yaml @@ -0,0 +1,25 @@ +name: cuvs_ivf_sq +constraints: + search: cuvs_bench.config.algos.constraints.cuvs_ivf_sq_search +groups: + base: + build: + nlist: [1024, 2048, 4096] + ratio: [2, 4] + niter: [25] + search: + nprobe: [1, 5, 10, 20, 50, 100, 200] + large: + build: + nlist: [8192, 16384, 32768] + ratio: [4] + niter: [20] + search: + nprobe: [10, 20, 50, 100, 200, 500, 1000] + test: + build: + nlist: [1024] + ratio: [2] + niter: [20] + search: + nprobe: [1, 5] diff --git a/python/cuvs_bench/cuvs_bench/config/algos/faiss_cpu_ivf_sq.yaml b/python/cuvs_bench/cuvs_bench/config/algos/faiss_cpu_ivf_sq.yaml new file mode 100644 index 0000000000..ce237f280d --- /dev/null +++ b/python/cuvs_bench/cuvs_bench/config/algos/faiss_cpu_ivf_sq.yaml @@ -0,0 +1,28 @@ +name: faiss_cpu_ivf_sq 
+constraints: + search: cuvs_bench.config.algos.constraints.faiss_cpu_ivf_sq_search +groups: + base: + build: + nlist: [1024, 2048, 4096] + ratio: [2, 4] + quantizer_type: [int8] + search: + nprobe: [1, 5, 10, 20, 50, 100, 200] + refine_ratio: [1] + large: + build: + nlist: [8192, 16384, 32768] + ratio: [4] + quantizer_type: [int8] + search: + nprobe: [10, 20, 50, 100, 200, 500, 1000] + refine_ratio: [1, 2] + test: + build: + nlist: [1024] + ratio: [2] + quantizer_type: [int8] + search: + nprobe: [1, 5] + refine_ratio: [1] diff --git a/python/cuvs_bench/cuvs_bench/config/algos/faiss_gpu_ivf_sq.yaml b/python/cuvs_bench/cuvs_bench/config/algos/faiss_gpu_ivf_sq.yaml new file mode 100644 index 0000000000..d49df8116d --- /dev/null +++ b/python/cuvs_bench/cuvs_bench/config/algos/faiss_gpu_ivf_sq.yaml @@ -0,0 +1,28 @@ +name: faiss_gpu_ivf_sq +constraints: + search: cuvs_bench.config.algos.constraints.faiss_gpu_ivf_sq_search +groups: + base: + build: + nlist: [1024, 2048, 4096] + ratio: [2, 4] + quantizer_type: [int8] + search: + nprobe: [1, 5, 10, 20, 50, 100, 200] + refine_ratio: [1] + large: + build: + nlist: [8192, 16384, 32768] + ratio: [4] + quantizer_type: [int8] + search: + nprobe: [10, 20, 50, 100, 200, 500, 1000] + refine_ratio: [1, 2] + test: + build: + nlist: [1024] + ratio: [2] + quantizer_type: [int8] + search: + nprobe: [1, 5] + refine_ratio: [1]