feat: add diskann index #369
Conversation
… feat/diskann_index
… feat/diskann_index
| -Wl,--whole-archive | ||
| $<TARGET_FILE:core_knn_flat_static> | ||
| $<TARGET_FILE:core_knn_flat_sparse_static> | ||
| $<TARGET_FILE:core_knn_hnsw_static> |
| run: | | ||
| sudo apt-get update | ||
| sudo apt-get install -y --no-install-recommends \ | ||
| libaio-dev |
There was a problem hiding this comment.
如果用户的环境没有装libaio-dev,会发生什么?
There was a problem hiding this comment.
现在默认使用是需要安装libaio,可以通过配置的方式进行区分,千问的建议是通过linux安装包的方式安装libaio库:
Installation
zvec requires the libaio system library on Linux.
On Ubuntu/Debian:
sudo apt-get install libaio1 libaio-dev
pip install zvec
There was a problem hiding this comment.
如果没有安装,会发生什么?这里预期的行为应该是 如果用户不安装aio,不影响除diskann的其他功能使用
| } | ||
|
|
||
| auto &pool = ctx->expanded_nodes(); | ||
| for (uint32_t i = 0; i < pool.size(); i++) { |
There was a problem hiding this comment.
可以使用std::remove_if + erase,效率高一些
|
|
||
| virtual ~DiskAnnQueryParams() = default; | ||
|
|
||
| int list_size() const { |
|
|
||
| sector_internal_id_++; | ||
| if (sector_internal_id_ >= sector_vec_num_) { | ||
| std::vector<uint8_t> padding_(padding_size_, 0); |
There was a problem hiding this comment.
没有必要allocate一个临时的std::vector?
std::memset(data_ptr + data_size_, 0, padding_size_);
… feat/diskann_index
| "proxima.stratified.trainer.cluster_params_in_level_"; | ||
|
|
||
| static const std::string MULTI_CHUNK_CLUSTER_COUNT = | ||
| "proxima.cluster.multi_chunk_cluster.count"; |
There was a problem hiding this comment.
等一起统一调整,这个其它索引里还没有改掉
|
|
||
| //! Init | ||
| int init() { | ||
| // file_.open(path, std::ios::in | std::ios::out); |
| typedef std::shared_ptr<MultiChunkClusterAlgorithm> Pointer; | ||
|
|
||
| //! Constructor | ||
| MultiChunkClusterAlgorithm(void) {} |
There was a problem hiding this comment.
使用default,{}会影响编译器优化,析构函数类似
| "ValueType must be arithmetic"); | ||
|
|
||
| //! Constructor | ||
| MultiChunkNumericalAlgorithm(void) {} |
|
|
||
| bool result = algorithm.cluster_once(*local_threads, &cost); | ||
| if (result != true) { | ||
| LOG_ERROR("(%u) Failed to cluster.", i + 1); |
There was a problem hiding this comment.
%u打印会有编译warning,建议%zu (size_t)xxx打印
| (*finished)++; | ||
| } | ||
|
|
||
| return; |
| int cleanup(void); | ||
|
|
||
| //! Reset Cluster | ||
| int reset(void); |
There was a problem hiding this comment.
void可以去掉,c++里面reset()即可
| } | ||
| } | ||
|
|
||
| (*out)[id * chunk_count_ + chunk] = static_cast<uint32_t>(sel_index); |
There was a problem hiding this comment.
不用cast,sel_index本身是uint32的
| entity_.set_neighbors(id, pruned_list); | ||
| lock_pool_[lock_idx].unlock(); | ||
|
|
||
| ret = inter_insert(id, pruned_list, ctx); |
There was a problem hiding this comment.
return inter_insert...否则会丢失错误
| return core::IndexError_Unsupported; | ||
| } | ||
|
|
||
| param_ = dynamic_cast<const DiskAnnIndexParam &>(param); |
| return pq_chunk_num_; | ||
| } | ||
|
|
||
| void pq_chunk_num(int pq_chunk_num) { |
| 20 | ||
| )pbdoc"); | ||
| diskann_params | ||
| .def(py::init<int>(), py::arg("list_size") = 10, R"pbdoc( |
There was a problem hiding this comment.
c++默认为300,注释也写默认300,这里是10
| Default is ``MetricType.IP`` (inner product). | ||
| max_degree (int):. | ||
| list_size (int): . | ||
| pq_chunk_num (bool): . |
There was a problem hiding this comment.
pq_chunk_num (bool): —— 类型是 int,不是 bool。
示例里 >>> print(params.n_list) / >>> print(params.nprobe) 引用的属性 DiskAnn 类上根本不存在
| uint64_t n_retries = 0) { | ||
| #if (defined(__linux) || defined(__linux__)) | ||
| uint64_t iters = | ||
| DiskAnnUtil::round_up(read_reqs.size(), MAX_EVENTS) / MAX_EVENTS; |
| break; | ||
| } | ||
| } | ||
| n_tries++; |
There was a problem hiding this comment.
retry不起作用,错误路径都返回或者break了
|
|
||
| if (ret != (int)n_ops) { | ||
| LOG_WARN( | ||
| "io_submit failed; returned: %d, expected=%lu, errno=%d, %s, " |
| ) | ||
| if(MSVC) | ||
| # MSVC: STATIC-only stub to avoid creating an empty DLL with no exports | ||
| cc_library( |
| // every other index type stays fully functional. | ||
| int EnsureDiskAnnRuntimeReady() { | ||
| const int status = ::zvec::LoadDiskAnnPlugin(); | ||
| if (status == ::zvec::kDiskAnnPluginOk) { |
There was a problem hiding this comment.
zvec前面的::可以去掉?应该不太会冲突
src/binding/python/binding.cc
35: [](const std::string &path) { return ::zvec::LoadDiskAnnPlugin(path); },
40: m.def("is_diskann_plugin_loaded", &::zvec::IsDiskAnnPluginLoaded,
42: m.def("is_libaio_available", &::zvec::IsLibAioAvailable,
48: m.attr("DISKANN_PLUGIN_OK") = static_cast<int>(::zvec::kDiskAnnPluginOk);
50: static_cast<int>(::zvec::kDiskAnnPluginUnsupportedPlatform);
52: static_cast<int>(::zvec::kDiskAnnPluginLibAioMissing);
54: static_cast<int>(::zvec::kDiskAnnPluginDlopenFailed);
src/core/plugin/diskann_plugin.cc
88: if (::dladdr(reinterpret_cast<void *>(&::zvec::LoadDiskAnnPlugin), &info) ==
105: if (::dladdr(reinterpret_cast<void *>(&::zvec::LoadDiskAnnPlugin), &info) ==
src/core/interface/indexes/diskann_index.cc
34: const int status = ::zvec::LoadDiskAnnPlugin();
35: if (status == ::zvec::kDiskAnnPluginOk) {
39: case ::zvec::kDiskAnnPluginLibAioMissing:
46: case ::zvec::kDiskAnnPluginUnsupportedPlatform:
51: case ::zvec::kDiskAnnPluginDlopenFailed:
| LOG_ERROR("Failed to open streamer, path: %s", file_path_.c_str()); | ||
| return core::IndexError_Runtime; | ||
| } | ||
| is_trained_ = true; |
There was a problem hiding this comment.
设置is_trained_后面add直接报错,这里创建streamer的作用是什么?
There was a problem hiding this comment.
这个streamer是提供readonly查询的,不提供add能力,和ivf保持一致
| auto holder = | ||
| std::make_shared<zvec::core::MultiPassIndexHolder<DataType::DT_FP16>>( | ||
| param_.dimension); | ||
| for (auto doc : doc_cache_) { |
There was a problem hiding this comment.
ivf_index.cc也有类似实现,抽个公共实现吧
| input_vector_meta_.dimension() * input_vector_meta_.unit_size()); | ||
|
|
||
| std::lock_guard<std::mutex> lock(mutex_); | ||
| while (doc_cache_.size() <= doc_id) { |
There was a problem hiding this comment.
已经上锁了,可以直接
if (doc_cache_.size() <= doc_id) {
doc_cache_.resize(doc_id + 1, {kInvalidKey, fake_data});
}
| auto dumper = core::IndexFactory::CreateDumper("FileDumper"); | ||
|
|
||
| dumper->create(file_path_); | ||
| builder_->dump(dumper); |
|
|
||
| int flush(void) override { | ||
| return IndexError_NotImplemented; | ||
| return 0; |
There was a problem hiding this comment.
这里因为index调用到了flush,所以还是做了0值返回,保证接口的一致。
| // See the License for the specific language governing permissions and | ||
| // limitations under the License. | ||
|
|
||
| #ifndef ZVEC_PLUGIN_DISKANN_PLUGIN_H |
| # --------------------------------------------------------------------------- # | ||
| # Plugin gating. | ||
| # --------------------------------------------------------------------------- # | ||
| _PLUGIN_LOAD_STATUS = zvec.load_diskann_plugin() |
There was a problem hiding this comment.
这里最好不要让用户手动调用一次吧,import zvec内可以自动调用load_diskann_plugin
There was a problem hiding this comment.
这个load_diskann_plugin不是必须调用,可以注释掉,或者去掉,会做隐式加载。如果没有load成功,会报错。
... ...
def main() -> int:
# ---- 1. Load the DiskAnn plugin. -------------------------------------- #
#status = zvec.load_diskann_plugin()
#if status != 0:
# print(
# f"[error] DiskAnn plugin failed to load (status={status}). "
# "Make sure libzvec_diskann_plugin.so sits next to _zvec.so.",
# file=sys.stderr,
# )
# return 1
#print("[ok] DiskAnn plugin loaded.")
# ---- 2. Prepare a throwaway workspace and the collection. ------------- #
...

Add a DiskANN index to Zvec to lower memory usage in vector search, as described in #325.