From da330bd247a36a5d6cbb8c290b6cacd4ed1db5b0 Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Thu, 14 May 2026 20:10:05 +0800 Subject: [PATCH 01/47] small block read --- src/ailego/buffer/vector_page_table.cc | 124 +++++++++++------- src/core/algorithm/hnsw/hnsw_entity.h | 29 +++- .../algorithm/hnsw/hnsw_streamer_entity.h | 28 +++- .../algorithm/vamana/vamana_streamer_entity.h | 28 +++- src/core/utility/buffer_storage.cc | 114 ++++++++++++---- .../zvec/ailego/buffer/vector_page_table.h | 34 ++--- .../zvec/core/framework/index_storage.h | 40 ++++++ 7 files changed, 281 insertions(+), 116 deletions(-) diff --git a/src/ailego/buffer/vector_page_table.cc b/src/ailego/buffer/vector_page_table.cc index fec7a1902..553919fb3 100644 --- a/src/ailego/buffer/vector_page_table.cc +++ b/src/ailego/buffer/vector_page_table.cc @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include +#include +#include #include #include @@ -41,6 +44,8 @@ static ssize_t zvec_pread(int fd, void *buf, size_t count, size_t offset) { namespace zvec { namespace ailego { +const size_t kVectorPageSize = MemoryHelper::PageSize(); + void VectorPageTable::init(size_t entry_num) { if (entries_) { delete[] entries_; @@ -97,12 +102,11 @@ void VectorPageTable::evict_block(block_id_t block_id) { assert(block_id < entry_num_); Entry &entry = entries_[block_id]; char *buffer = entry.buffer; - size_t size = entry.size; int expected = 0; if (entry.ref_count.compare_exchange_strong( expected, std::numeric_limits::min())) { if (buffer) { - MemoryLimitPool::get_instance().release_buffer(buffer, size); + MemoryLimitPool::get_instance().release_buffer(buffer, kVectorPageSize); } } // Always reset in_evict_queue regardless of whether the CAS succeeded: @@ -113,32 +117,20 @@ void VectorPageTable::evict_block(block_id_t block_id) { entry.in_evict_queue.store(false, std::memory_order_relaxed); } -char *VectorPageTable::set_block_acquired(block_id_t block_id, char *buffer, - size_t size) { +char *VectorPageTable::set_block_acquired(block_id_t block_id, char *buffer) { assert(block_id < entry_num_); Entry &entry = entries_[block_id]; while (true) { int current_count = entry.ref_count.load(std::memory_order_relaxed); if (current_count >= 0) { - // Defensive branch: in practice this path should never be reached. - // set_block_acquired() is always called under block_mutexes_[block_id], - // and the caller (acquire_buffer) re-checks acquire_block() inside the - // same lock before invoking this function. Therefore, if we get here, - // ref_count must still be negative (unloaded). This branch is retained - // as a safety net in case the locking contract is violated in the future, - // e.g. if set_block_acquired is called from an unlocked context. if (entry.ref_count.compare_exchange_weak( current_count, current_count + 1, std::memory_order_acq_rel, std::memory_order_acquire)) { - MemoryLimitPool::get_instance().release_buffer(buffer, size); + MemoryLimitPool::get_instance().release_buffer(buffer, kVectorPageSize); return entry.buffer; } } else { entry.buffer = buffer; - entry.size = size; - // Ensure in_evict_queue is cleared when the block is freshly loaded so - // that the first release_block() after loading can register it in the - // eviction queue. entry.in_evict_queue.store(false, std::memory_order_relaxed); entry.ref_count.store(1, std::memory_order_release); return entry.buffer; @@ -170,15 +162,13 @@ VecBufferPool::VecBufferPool(const std::string &filename) { file_size_ = st.st_size; } -int VecBufferPool::init(size_t segment_count) { - size_t block_num = segment_count + 10; +int VecBufferPool::init() { + size_t block_num = (file_size_ + kVectorPageSize - 1) / kVectorPageSize; page_table_.init(block_num); - // Allocate all mutexes in a single contiguous array so that the cold-path - // lock in acquire_buffer() accesses cache-friendly memory instead of - // chasing 31K+ independent heap pointers. - block_mutexes_ = std::make_unique(block_num); - block_mutexes_count_ = block_num; - LOG_DEBUG("entry num: %zu", page_table_.entry_num()); + block_mutexes_ = + std::make_unique(VecBufferPool::kMutexBucketCount); + LOG_DEBUG("entry num: %zu, file_size: %zu", page_table_.entry_num(), + file_size_); return 0; } @@ -186,54 +176,57 @@ VecBufferPoolHandle VecBufferPool::get_handle() { return VecBufferPoolHandle(*this); } -char *VecBufferPool::acquire_buffer(block_id_t block_id, size_t offset, - size_t size, int retry) { - assert(block_id < block_mutexes_count_); - char *buffer = page_table_.acquire_block(block_id); +char *VecBufferPool::acquire_buffer(block_id_t page_id, int retry) { + assert(page_id < page_table_.entry_num()); + char *buffer = page_table_.acquire_block(page_id); if (buffer) { return buffer; } - std::lock_guard lock(block_mutexes_[block_id]); - buffer = page_table_.acquire_block(block_id); + std::lock_guard lock( + block_mutexes_[page_id % VecBufferPool::kMutexBucketCount]); + buffer = page_table_.acquire_block(page_id); if (buffer) { return buffer; } { - bool found = - MemoryLimitPool::get_instance().try_acquire_buffer(size, buffer); + bool found = MemoryLimitPool::get_instance().try_acquire_buffer( + kVectorPageSize, buffer); if (!found) { for (int i = 0; i < retry; i++) { BlockEvictionQueue::get_instance().recycle(); - found = - MemoryLimitPool::get_instance().try_acquire_buffer(size, buffer); + found = MemoryLimitPool::get_instance().try_acquire_buffer( + kVectorPageSize, buffer); if (found) { break; } } } if (!found) { - LOG_ERROR( - "Buffer pool failed to get free buffer: file[%s], block_id[%zu], " - "offset[%zu], size[%zu]", - file_name_.c_str(), block_id, offset, size); + LOG_ERROR("Buffer pool failed to get free buffer: file[%s], page_id[%zu]", + file_name_.c_str(), page_id); return nullptr; } } + size_t page_offset = page_id * kVectorPageSize; + size_t expected_bytes = std::min(kVectorPageSize, file_size_ - page_offset); + if (expected_bytes < kVectorPageSize) { + std::memset(buffer + expected_bytes, 0, kVectorPageSize - expected_bytes); + } #if defined(_MSC_VER) - ssize_t read_bytes = zvec_pread(fd_, buffer, size, offset); + ssize_t read_bytes = zvec_pread(fd_, buffer, expected_bytes, page_offset); #else - ssize_t read_bytes = pread(fd_, buffer, size, offset); + ssize_t read_bytes = pread(fd_, buffer, expected_bytes, page_offset); #endif - if (read_bytes != static_cast(size)) { + if (read_bytes != static_cast(expected_bytes)) { LOG_ERROR( - "Buffer pool failed to read file at offset: file[%s], block_id[%zu], " - "offset[%zu], size[%zu]", - file_name_.c_str(), block_id, offset, size); - MemoryLimitPool::get_instance().release_buffer(buffer, size); + "Buffer pool failed to read file at offset: file[%s], page_id[%zu], " + "offset[%zu], expected[%zu], got[%zd]", + file_name_.c_str(), page_id, page_offset, expected_bytes, read_bytes); + MemoryLimitPool::get_instance().release_buffer(buffer, kVectorPageSize); return nullptr; } - return page_table_.set_block_acquired(block_id, buffer, size); + return page_table_.set_block_acquired(page_id, buffer); } int VecBufferPool::get_meta(size_t offset, size_t length, char *buffer) { @@ -252,10 +245,41 @@ int VecBufferPool::get_meta(size_t offset, size_t length, char *buffer) { return 0; } -char *VecBufferPoolHandle::get_block(size_t offset, size_t size, - size_t block_id) { - char *buffer = pool_.acquire_buffer(block_id, offset, size, 50); - return buffer; +char *VecBufferPoolHandle::get_single_page(size_t file_offset, size_t len, + size_t &out_page_id) { + size_t first_page = file_offset / kVectorPageSize; + assert(len == 0 || (file_offset + len - 1) / kVectorPageSize == first_page); + out_page_id = first_page; + char *page = pool_.acquire_buffer(first_page, 50); + if (!page) { + return nullptr; + } + return page + (file_offset - first_page * kVectorPageSize); +} + +bool VecBufferPoolHandle::read_range(size_t file_offset, size_t len, + char *out) { + if (len == 0) { + return true; + } + size_t first_page = file_offset / kVectorPageSize; + size_t last_page = (file_offset + len - 1) / kVectorPageSize; + size_t remaining = len; + size_t dst_cursor = 0; + for (size_t pg = first_page; pg <= last_page; ++pg) { + char *page = pool_.acquire_buffer(pg, 50); + if (!page) { + return false; + } + size_t page_start = pg * kVectorPageSize; + size_t intra_offset = (pg == first_page) ? (file_offset - page_start) : 0; + size_t chunk = std::min(kVectorPageSize - intra_offset, remaining); + std::memcpy(out + dst_cursor, page + intra_offset, chunk); + pool_.page_table_.release_block(pg); + dst_cursor += chunk; + remaining -= chunk; + } + return true; } int VecBufferPoolHandle::get_meta(size_t offset, size_t length, char *buffer) { diff --git a/src/core/algorithm/hnsw/hnsw_entity.h b/src/core/algorithm/hnsw/hnsw_entity.h index a6ead8f63..bae57ec7a 100644 --- a/src/core/algorithm/hnsw/hnsw_entity.h +++ b/src/core/algorithm/hnsw/hnsw_entity.h @@ -201,11 +201,21 @@ struct BufferPoolMemoryBlock { void *data) : buffer_pool_handle_(handle), buffer_block_id_(block_id), data_(data) {} + static BufferPoolMemoryBlock MakeOwned(void *owned_data) { + BufferPoolMemoryBlock b; + b.owns_buffer_ = true; + b.data_ = owned_data; + return b; + } + BufferPoolMemoryBlock(const BufferPoolMemoryBlock &rhs) : buffer_pool_handle_(rhs.buffer_pool_handle_), buffer_block_id_(rhs.buffer_block_id_), data_(rhs.data_) { - if (buffer_pool_handle_) { + if (rhs.owns_buffer_) { + owns_buffer_ = false; + buffer_pool_handle_ = nullptr; + } else if (buffer_pool_handle_) { buffer_pool_handle_->acquire_one(buffer_block_id_); } } @@ -216,7 +226,10 @@ struct BufferPoolMemoryBlock { buffer_pool_handle_ = rhs.buffer_pool_handle_; buffer_block_id_ = rhs.buffer_block_id_; data_ = rhs.data_; - if (buffer_pool_handle_) { + if (rhs.owns_buffer_) { + owns_buffer_ = false; + buffer_pool_handle_ = nullptr; + } else if (buffer_pool_handle_) { buffer_pool_handle_->acquire_one(buffer_block_id_); } } @@ -226,8 +239,10 @@ struct BufferPoolMemoryBlock { BufferPoolMemoryBlock(BufferPoolMemoryBlock &&rhs) noexcept : buffer_pool_handle_(rhs.buffer_pool_handle_), buffer_block_id_(rhs.buffer_block_id_), + owns_buffer_(rhs.owns_buffer_), data_(rhs.data_) { rhs.buffer_pool_handle_ = nullptr; + rhs.owns_buffer_ = false; rhs.data_ = nullptr; } @@ -236,8 +251,10 @@ struct BufferPoolMemoryBlock { release(); buffer_pool_handle_ = rhs.buffer_pool_handle_; buffer_block_id_ = rhs.buffer_block_id_; + owns_buffer_ = rhs.owns_buffer_; data_ = rhs.data_; rhs.buffer_pool_handle_ = nullptr; + rhs.owns_buffer_ = false; rhs.data_ = nullptr; } return *this; @@ -260,7 +277,12 @@ struct BufferPoolMemoryBlock { private: void release() { - if (buffer_pool_handle_) { + if (owns_buffer_) { + if (data_) { + ailego_free(data_); + } + owns_buffer_ = false; + } else if (buffer_pool_handle_) { buffer_pool_handle_->release_one(buffer_block_id_); buffer_pool_handle_ = nullptr; } @@ -269,6 +291,7 @@ struct BufferPoolMemoryBlock { ailego::VecBufferPoolHandle *buffer_pool_handle_{nullptr}; size_t buffer_block_id_{0}; + bool owns_buffer_{false}; void *data_{nullptr}; }; diff --git a/src/core/algorithm/hnsw/hnsw_streamer_entity.h b/src/core/algorithm/hnsw/hnsw_streamer_entity.h index 3dc6c9640..3c2fb0cea 100644 --- a/src/core/algorithm/hnsw/hnsw_streamer_entity.h +++ b/src/core/algorithm/hnsw/hnsw_streamer_entity.h @@ -638,9 +638,16 @@ HnswStreamerEntity::get_neighbors_typed( LOG_ERROR("Read neighbor header failed, ret=%zu", ret); return NeighborsT(); } - BufferPoolMemoryBlock block(mem_block.buffer_pool_handle_, - mem_block.buffer_block_id_, mem_block.data_); - mem_block.buffer_pool_handle_ = nullptr; + BufferPoolMemoryBlock block; + if (mem_block.type_ == IndexStorage::MemoryBlock::MBT_HEAP_SCRATCH) { + block = BufferPoolMemoryBlock::MakeOwned(mem_block.data_); + mem_block.data_ = nullptr; + mem_block.type_ = IndexStorage::MemoryBlock::MBT_UNKNOWN; + } else { + block = BufferPoolMemoryBlock(mem_block.buffer_pool_handle_, + mem_block.buffer_block_id_, mem_block.data_); + mem_block.buffer_pool_handle_ = nullptr; + } return NeighborsT(std::move(block)); } @@ -688,10 +695,19 @@ inline int HnswStreamerEntity::get_vector_typed( loc.second, read_size, ret); return IndexError_ReadData; } - vec_blocks[i] = - BufferPoolMemoryBlock(mem_block.buffer_pool_handle_, + vec_blocks[i] = [&]() { + if (mem_block.type_ == IndexStorage::MemoryBlock::MBT_HEAP_SCRATCH) { + BufferPoolMemoryBlock b = + BufferPoolMemoryBlock::MakeOwned(mem_block.data_); + mem_block.data_ = nullptr; + mem_block.type_ = IndexStorage::MemoryBlock::MBT_UNKNOWN; + return b; + } + BufferPoolMemoryBlock b(mem_block.buffer_pool_handle_, mem_block.buffer_block_id_, mem_block.data_); - mem_block.buffer_pool_handle_ = nullptr; + mem_block.buffer_pool_handle_ = nullptr; + return b; + }(); } return 0; } diff --git a/src/core/algorithm/vamana/vamana_streamer_entity.h b/src/core/algorithm/vamana/vamana_streamer_entity.h index ae2918786..ab8878cb3 100644 --- a/src/core/algorithm/vamana/vamana_streamer_entity.h +++ b/src/core/algorithm/vamana/vamana_streamer_entity.h @@ -352,9 +352,16 @@ VamanaStreamerEntity::get_neighbors_typed( LOG_ERROR("Read neighbor header failed, ret=%zu", ret); return NeighborsT(); } - BufferPoolMemoryBlock block(mem_block.buffer_pool_handle_, - mem_block.buffer_block_id_, mem_block.data_); - mem_block.buffer_pool_handle_ = nullptr; + BufferPoolMemoryBlock block; + if (mem_block.type_ == IndexStorage::MemoryBlock::MBT_HEAP_SCRATCH) { + block = BufferPoolMemoryBlock::MakeOwned(mem_block.data_); + mem_block.data_ = nullptr; + mem_block.type_ = IndexStorage::MemoryBlock::MBT_UNKNOWN; + } else { + block = BufferPoolMemoryBlock(mem_block.buffer_pool_handle_, + mem_block.buffer_block_id_, mem_block.data_); + mem_block.buffer_pool_handle_ = nullptr; + } return NeighborsT(std::move(block)); } @@ -392,10 +399,19 @@ inline int VamanaStreamerEntity::get_vector_typed( LOG_ERROR("Read vector failed, ret=%zu", ret); return IndexError_ReadData; } - vec_blocks[i] = - BufferPoolMemoryBlock(mem_block.buffer_pool_handle_, + vec_blocks[i] = [&]() { + if (mem_block.type_ == IndexStorage::MemoryBlock::MBT_HEAP_SCRATCH) { + BufferPoolMemoryBlock b = + BufferPoolMemoryBlock::MakeOwned(mem_block.data_); + mem_block.data_ = nullptr; + mem_block.type_ = IndexStorage::MemoryBlock::MBT_UNKNOWN; + return b; + } + BufferPoolMemoryBlock b(mem_block.buffer_pool_handle_, mem_block.buffer_block_id_, mem_block.data_); - mem_block.buffer_pool_handle_ = nullptr; + mem_block.buffer_pool_handle_ = nullptr; + return b; + }(); } return 0; } diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index 62d442a5b..d0a05fd37 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -80,15 +80,13 @@ class BufferStorage : public IndexStorage { } len = meta->data_size - offset; } - size_t buffer_offset = segment_header_start_offset_ + - segment_header_->content_offset + - segment_->meta()->data_index; - auto *raw = owner_->get_buffer(buffer_offset, capacity_, segment_id_); - if (!raw) { + size_t abs_offset = segment_header_start_offset_ + + segment_header_->content_offset + + segment_->meta()->data_index + offset; + if (!owner_->buffer_pool_handle_->read_range(abs_offset, len, + static_cast(buf))) { return 0; } - auto *data = raw + offset; - memmove(buf, data, len); return len; } @@ -101,14 +99,33 @@ class BufferStorage : public IndexStorage { } len = meta->data_size - offset; } - size_t buffer_offset = segment_header_start_offset_ + - segment_header_->content_offset + - segment_->meta()->data_index; - auto *raw = owner_->get_buffer(buffer_offset, capacity_, segment_id_); - if (!raw) { + size_t abs_offset = segment_header_start_offset_ + + segment_header_->content_offset + + segment_->meta()->data_index + offset; + size_t first_page = abs_offset / ailego::kVectorPageSize; + size_t last_page = (len == 0) + ? first_page + : (abs_offset + len - 1) / ailego::kVectorPageSize; + if (first_page == last_page) { + size_t page_id = 0; + char *raw = owner_->buffer_pool_handle_->get_single_page(abs_offset, + len, page_id); + if (!raw) { + return 0; + } + *data = raw; + return len; + } + char *tmp = static_cast(ailego_aligned_malloc(len, 4096)); + if (!tmp) { return 0; } - *data = raw + offset; + if (!owner_->buffer_pool_handle_->read_range(abs_offset, len, tmp)) { + ailego_free(tmp); + return 0; + } + owner_->register_tmp_buffer(tmp); + *data = tmp; return len; } @@ -120,21 +137,36 @@ class BufferStorage : public IndexStorage { } len = meta->data_size - offset; } - size_t buffer_offset = segment_header_start_offset_ + - segment_header_->content_offset + - segment_->meta()->data_index; - auto *raw = owner_->get_buffer(buffer_offset, capacity_, segment_id_); - if (!raw) { - return 0; - } - - data.reset(owner_->buffer_pool_handle_.get(), segment_id_, raw + offset); - if (data.data()) { + size_t abs_offset = segment_header_start_offset_ + + segment_header_->content_offset + + segment_->meta()->data_index + offset; + size_t first_page = abs_offset / ailego::kVectorPageSize; + size_t last_page = (len == 0) + ? first_page + : (abs_offset + len - 1) / ailego::kVectorPageSize; + if (first_page == last_page) { + size_t page_id = 0; + char *raw = owner_->buffer_pool_handle_->get_single_page(abs_offset, + len, page_id); + if (!raw) { + LOG_ERROR("read error (single-page acquire failed)."); + return -1; + } + data.reset(owner_->buffer_pool_handle_.get(), page_id, raw); return len; - } else { - LOG_ERROR("read error."); + } + char *tmp = static_cast(ailego_aligned_malloc(len, 4096)); + if (!tmp) { + LOG_ERROR("read error (alloc cross-page temp buffer failed)."); + return -1; + } + if (!owner_->buffer_pool_handle_->read_range(abs_offset, len, tmp)) { + ailego_free(tmp); + LOG_ERROR("read error (cross-page read_range failed)."); return -1; } + data = MemoryBlock::MakeOwned(tmp); + return len; } //! Write data into the storage with offset @@ -199,7 +231,7 @@ class BufferStorage : public IndexStorage { if (ret != 0) { return ret; } - ret = buffer_pool_->init(segments_.size()); + ret = buffer_pool_->init(); if (ret != 0) { return ret; } @@ -210,8 +242,22 @@ class BufferStorage : public IndexStorage { return 0; } - char *get_buffer(size_t offset, size_t length, size_t block_id) { - return buffer_pool_handle_->get_block(offset, length, block_id); + void register_tmp_buffer(char *buf) { + std::lock_guard latch(tmp_buffers_mutex_); + tmp_buffers_.push_back(buf); + } + + char *get_buffer(size_t offset, size_t length, size_t /*block_id*/) { + char *tmp = static_cast(ailego_aligned_malloc(length, 4096)); + if (!tmp) { + return nullptr; + } + if (!buffer_pool_handle_->read_range(offset, length, tmp)) { + ailego_free(tmp); + return nullptr; + } + register_tmp_buffer(tmp); + return tmp; } int get_meta(size_t offset, size_t length, char *out) { @@ -472,6 +518,15 @@ class BufferStorage : public IndexStorage { segments_.clear(); memset(&header_, 0, sizeof(header_)); memset(&footer_, 0, sizeof(footer_)); + { + std::lock_guard tmp_latch(tmp_buffers_mutex_); + for (char *p : tmp_buffers_) { + if (p) { + ailego_free(p); + } + } + tmp_buffers_.clear(); + } buffer_pool_handle_.reset(); buffer_pool_.reset(); max_segment_size_ = 0; @@ -503,6 +558,9 @@ class BufferStorage : public IndexStorage { bool index_dirty_{false}; mutable std::mutex mapping_mutex_{}; + std::vector tmp_buffers_{}; + mutable std::mutex tmp_buffers_mutex_{}; + // buffer manager std::string file_name_; IndexFormat::MetaHeader header_{}; diff --git a/src/include/zvec/ailego/buffer/vector_page_table.h b/src/include/zvec/ailego/buffer/vector_page_table.h index 653b7af53..c6a08c9da 100644 --- a/src/include/zvec/ailego/buffer/vector_page_table.h +++ b/src/include/zvec/ailego/buffer/vector_page_table.h @@ -42,16 +42,13 @@ namespace zvec { namespace ailego { +extern const size_t kVectorPageSize; + class VectorPageTable { - struct alignas(64) Entry { + struct Entry { std::atomic ref_count; - // True when this block has been enqueued in BlockEvictionQueue and has not - // yet been evicted. Used in release_block() to suppress duplicate - // insertions: once a block is in the eviction queue we never push it again - // until it is evicted (which resets the flag). std::atomic in_evict_queue; char *buffer; - size_t size; }; public: @@ -76,22 +73,17 @@ class VectorPageTable { void evict_block(block_id_t block_id); - char *set_block_acquired(block_id_t block_id, char *buffer, size_t size); + char *set_block_acquired(block_id_t block_id, char *buffer); size_t entry_num() const { return entry_num_; } - // Returns true if the block has no active references (ref_count <= 0). - // Used by VecBufferPool destructor to assert all handles are released. bool is_released(block_id_t block_id) const { assert(block_id < entry_num_); return entries_[block_id].ref_count.load(std::memory_order_relaxed) <= 0; } - // Returns true if the block is no longer registered in the eviction queue - // (either it was never added, or it has already been evicted). - // Used by BlockEvictionQueue to detect stale queue entries. inline bool is_dead_block(BlockEvictionQueue::BlockType block) const { Entry &entry = entries_[block.vector_block.first]; return !entry.in_evict_queue.load(std::memory_order_relaxed); @@ -108,12 +100,11 @@ class VecBufferPool { public: typedef std::shared_ptr Pointer; + static constexpr size_t kMutexBucketCount = 64UL * 1024UL; + VecBufferPool(const std::string &filename); ~VecBufferPool() { for (size_t i = 0; i < page_table_.entry_num(); ++i) { - // A positive ref_count means a VecBufferPoolHandle is still alive, - // which is a contract violation: all handles must be destroyed before - // the pool itself is destroyed. assert(page_table_.is_released(i)); page_table_.evict_block(i); } @@ -124,12 +115,11 @@ class VecBufferPool { #endif } - int init(size_t segment_count); + int init(); VecBufferPoolHandle get_handle(); - char *acquire_buffer(block_id_t block_id, size_t offset, size_t size, - int retry = 0); + char *acquire_buffer(block_id_t page_id, int retry = 0); int get_meta(size_t offset, size_t length, char *buffer); @@ -146,11 +136,7 @@ class VecBufferPool { VectorPageTable page_table_; private: - // Contiguous array of per-block mutexes (one allocation, cache-friendly for - // the cold-path load in acquire_buffer). block_mutexes_count_ mirrors the - // array length because unique_ptr has no built-in size accessor. std::unique_ptr block_mutexes_{}; - size_t block_mutexes_count_{0}; }; class VecBufferPoolHandle { @@ -162,7 +148,9 @@ class VecBufferPoolHandle { typedef std::shared_ptr Pointer; - char *get_block(size_t offset, size_t size, size_t block_id); + char *get_single_page(size_t file_offset, size_t len, size_t &out_page_id); + + bool read_range(size_t file_offset, size_t len, char *out); int get_meta(size_t offset, size_t length, char *buffer); diff --git a/src/include/zvec/core/framework/index_storage.h b/src/include/zvec/core/framework/index_storage.h index ac1052e86..530073aad 100644 --- a/src/include/zvec/core/framework/index_storage.h +++ b/src/include/zvec/core/framework/index_storage.h @@ -34,6 +34,7 @@ class IndexStorage : public IndexModule { MBT_UNKNOWN = 0, MBT_MMAP = 1, MBT_BUFFERPOOL = 2, + MBT_HEAP_SCRATCH = 3, }; MemoryBlock() {} @@ -46,9 +47,17 @@ class IndexStorage : public IndexModule { } MemoryBlock(void *data) : type_(MemoryBlockType::MBT_MMAP), data_(data) {} + static MemoryBlock MakeOwned(void *owned) { + MemoryBlock mb; + mb.type_ = MemoryBlockType::MBT_HEAP_SCRATCH; + mb.data_ = owned; + return mb; + } + MemoryBlock(const MemoryBlock &rhs) { switch (rhs.type_) { case MemoryBlockType::MBT_MMAP: + case MemoryBlockType::MBT_HEAP_SCRATCH: this->reset(rhs.data_); break; case MemoryBlockType::MBT_BUFFERPOOL: @@ -71,6 +80,12 @@ class IndexStorage : public IndexModule { rhs.buffer_pool_handle_ = nullptr; rhs.type_ = MemoryBlockType::MBT_UNKNOWN; break; + case MemoryBlockType::MBT_HEAP_SCRATCH: + type_ = MemoryBlockType::MBT_HEAP_SCRATCH; + data_ = rhs.data_; + rhs.data_ = nullptr; + rhs.type_ = MemoryBlockType::MBT_UNKNOWN; + break; default: break; } @@ -87,6 +102,9 @@ class IndexStorage : public IndexModule { rhs.data_); buffer_pool_handle_->acquire_one(buffer_block_id_); break; + case MemoryBlockType::MBT_HEAP_SCRATCH: + this->reset(rhs.data_); + break; default: break; } @@ -106,6 +124,13 @@ class IndexStorage : public IndexModule { rhs.buffer_pool_handle_ = nullptr; rhs.type_ = MemoryBlockType::MBT_UNKNOWN; break; + case MemoryBlockType::MBT_HEAP_SCRATCH: + release_owned(); + type_ = MemoryBlockType::MBT_HEAP_SCRATCH; + data_ = rhs.data_; + rhs.data_ = nullptr; + rhs.type_ = MemoryBlockType::MBT_UNKNOWN; + break; default: break; } @@ -122,6 +147,9 @@ class IndexStorage : public IndexModule { buffer_pool_handle_->release_one(buffer_block_id_); } break; + case MemoryBlockType::MBT_HEAP_SCRATCH: + release_owned(); + break; default: break; } @@ -136,6 +164,8 @@ class IndexStorage : public IndexModule { void *data) { if (type_ == MemoryBlockType::MBT_BUFFERPOOL) { buffer_pool_handle_->release_one(buffer_block_id_); + } else if (type_ == MemoryBlockType::MBT_HEAP_SCRATCH) { + release_owned(); } type_ = MemoryBlockType::MBT_BUFFERPOOL; buffer_pool_handle_ = buffer_pool_handle; @@ -147,6 +177,8 @@ class IndexStorage : public IndexModule { if (type_ == MemoryBlockType::MBT_BUFFERPOOL) { buffer_pool_handle_->release_one(buffer_block_id_); buffer_pool_handle_ = nullptr; + } else if (type_ == MemoryBlockType::MBT_HEAP_SCRATCH) { + release_owned(); } type_ = MemoryBlockType::MBT_MMAP; data_ = data; @@ -156,6 +188,14 @@ class IndexStorage : public IndexModule { void *data_{nullptr}; mutable ailego::VecBufferPoolHandle *buffer_pool_handle_{nullptr}; size_t buffer_block_id_{0}; + + private: + void release_owned() { + if (data_) { + ailego_free(data_); + data_ = nullptr; + } + } }; struct SegmentData { From a5077f31d6d05bef6cc5f1f629e90bcebcce8552 Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Fri, 15 May 2026 11:43:37 +0800 Subject: [PATCH 02/47] buffer write --- src/ailego/buffer/vector_page_table.cc | 164 +++++- .../algorithm/flat/flat_streamer_entity.cc | 22 +- src/core/algorithm/hnsw/hnsw_index_hash.h | 37 +- src/core/utility/buffer_storage.cc | 491 ++++++++++++++++-- .../zvec/ailego/buffer/vector_page_table.h | 79 ++- .../flat/flat_streamer_buffer_test.cc | 246 ++++++++- .../hnsw/hnsw_streamer_buffer_test.cc | 248 +++++++++ 7 files changed, 1202 insertions(+), 85 deletions(-) diff --git a/src/ailego/buffer/vector_page_table.cc b/src/ailego/buffer/vector_page_table.cc index 553919fb3..43a434225 100644 --- a/src/ailego/buffer/vector_page_table.cc +++ b/src/ailego/buffer/vector_page_table.cc @@ -39,6 +39,19 @@ static ssize_t zvec_pread(int fd, void *buf, size_t count, size_t offset) { } return static_cast(bytes_read); } +static ssize_t zvec_pwrite(int fd, const void *buf, size_t count, + size_t offset) { + HANDLE handle = reinterpret_cast(_get_osfhandle(fd)); + if (handle == INVALID_HANDLE_VALUE) return -1; + OVERLAPPED ov = {}; + ov.Offset = static_cast(offset & 0xFFFFFFFF); + ov.OffsetHigh = static_cast(offset >> 32); + DWORD bytes_written = 0; + if (!WriteFile(handle, buf, static_cast(count), &bytes_written, &ov)) { + return -1; + } + return static_cast(bytes_written); +} #endif namespace zvec { @@ -55,7 +68,9 @@ void VectorPageTable::init(size_t entry_num) { for (size_t i = 0; i < entry_num_; i++) { entries_[i].ref_count.store(std::numeric_limits::min()); entries_[i].in_evict_queue.store(false); + entries_[i].is_dirty.store(false); entries_[i].buffer = nullptr; + entries_[i].file_offset = 0; } } @@ -105,6 +120,13 @@ void VectorPageTable::evict_block(block_id_t block_id) { int expected = 0; if (entry.ref_count.compare_exchange_strong( expected, std::numeric_limits::min())) { + // If the block is dirty, flush it to disk before freeing the memory so + // that no modified data is silently lost during eviction. + if (buffer && entry.is_dirty.load(std::memory_order_relaxed) && + flush_callback_) { + flush_callback_(block_id, buffer, kVectorPageSize, entry.file_offset); + entry.is_dirty.store(false, std::memory_order_relaxed); + } if (buffer) { MemoryLimitPool::get_instance().release_buffer(buffer, kVectorPageSize); } @@ -117,7 +139,8 @@ void VectorPageTable::evict_block(block_id_t block_id) { entry.in_evict_queue.store(false, std::memory_order_relaxed); } -char *VectorPageTable::set_block_acquired(block_id_t block_id, char *buffer) { +char *VectorPageTable::set_block_acquired(block_id_t block_id, char *buffer, + size_t file_offset) { assert(block_id < entry_num_); Entry &entry = entries_[block_id]; while (true) { @@ -131,19 +154,32 @@ char *VectorPageTable::set_block_acquired(block_id_t block_id, char *buffer) { } } else { entry.buffer = buffer; + entry.file_offset = file_offset; entry.in_evict_queue.store(false, std::memory_order_relaxed); + // A freshly loaded block is clean (memory matches disk). + entry.is_dirty.store(false, std::memory_order_relaxed); entry.ref_count.store(1, std::memory_order_release); return entry.buffer; } } } -VecBufferPool::VecBufferPool(const std::string &filename) { +VecBufferPool::VecBufferPool(const std::string &filename, bool writable, + bool create) { file_name_ = filename; + writable_ = writable || create; #if defined(_MSC_VER) - fd_ = _open(filename.c_str(), O_RDONLY | _O_BINARY); + int flags = + writable_ + ? (create ? (O_RDWR | O_CREAT | O_TRUNC | _O_BINARY) + : (O_RDWR | _O_BINARY)) + : (O_RDONLY | _O_BINARY); + fd_ = _open(filename.c_str(), flags, 0644); #else - fd_ = open(filename.c_str(), O_RDONLY); + int flags = writable_ + ? (create ? (O_RDWR | O_CREAT | O_TRUNC) : O_RDWR) + : O_RDONLY; + fd_ = ::open(filename.c_str(), flags, 0644); #endif if (fd_ < 0) { throw std::runtime_error("Failed to open file: " + filename); @@ -169,6 +205,31 @@ int VecBufferPool::init() { std::make_unique(VecBufferPool::kMutexBucketCount); LOG_DEBUG("entry num: %zu, file_size: %zu", page_table_.entry_num(), file_size_); + + // In writable mode, inject a flush callback into the page table so that + // evict_block()/flush_block()/flush_all() can pwrite dirty blocks back to + // the backing file without needing to know about fd_ directly. + if (writable_) { + int fd = fd_; + const std::string &name = file_name_; + page_table_.set_flush_callback( + [fd, &name](block_id_t /*block_id*/, char *buf, size_t sz, + size_t off) -> int { +#if defined(_MSC_VER) + ssize_t w = zvec_pwrite(fd, buf, sz, off); +#else + ssize_t w = ::pwrite(fd, buf, sz, off); +#endif + if (w != static_cast(sz)) { + LOG_ERROR( + "Buffer pool flush failed: file[%s], offset[%zu], " + "expected[%zu], got[%zd]", + name.c_str(), off, sz, w); + return -1; + } + return 0; + }); + } return 0; } @@ -226,7 +287,7 @@ char *VecBufferPool::acquire_buffer(block_id_t page_id, int retry) { MemoryLimitPool::get_instance().release_buffer(buffer, kVectorPageSize); return nullptr; } - return page_table_.set_block_acquired(page_id, buffer); + return page_table_.set_block_acquired(page_id, buffer, page_offset); } int VecBufferPool::get_meta(size_t offset, size_t length, char *buffer) { @@ -245,6 +306,81 @@ int VecBufferPool::get_meta(size_t offset, size_t length, char *buffer) { return 0; } +int VecBufferPool::write_range(size_t file_offset, size_t length, + const char *src) { + if (!writable_) { + LOG_ERROR("write_range called on read-only pool: file[%s]", + file_name_.c_str()); + return -1; + } + if (length == 0) { + return 0; + } + size_t first_page = file_offset / kVectorPageSize; + size_t last_page = (file_offset + length - 1) / kVectorPageSize; + size_t remaining = length; + size_t src_cursor = 0; + for (size_t pg = first_page; pg <= last_page; ++pg) { + // Loading the page ensures we do not clobber unrelated bytes within the + // same page when the write is not page-aligned. acquire_buffer() pre-fills + // from the backing file (or zero-pads beyond EOF). + char *page = this->acquire_buffer(pg, 50); + if (!page) { + LOG_ERROR("write_range acquire failed: file[%s], page[%zu]", + file_name_.c_str(), pg); + return -1; + } + size_t page_start = pg * kVectorPageSize; + size_t intra_offset = + (pg == first_page) ? (file_offset - page_start) : 0; + size_t chunk = std::min(kVectorPageSize - intra_offset, remaining); + std::memcpy(page + intra_offset, src + src_cursor, chunk); + page_table_.mark_dirty(pg); + page_table_.release_block(pg); + src_cursor += chunk; + remaining -= chunk; + } + return 0; +} + +int VecBufferPool::write_meta(size_t offset, size_t length, + const char *buffer) { + if (!writable_) { + LOG_ERROR("write_meta called on read-only pool: file[%s]", + file_name_.c_str()); + return -1; + } +#if defined(_MSC_VER) + ssize_t w = zvec_pwrite(fd_, buffer, length, offset); +#else + ssize_t w = ::pwrite(fd_, buffer, length, offset); +#endif + if (w != static_cast(length)) { + LOG_ERROR( + "Buffer pool failed to write meta: file[%s], offset[%zu], " + "length[%zu], got[%zd]", + file_name_.c_str(), offset, length, w); + return -1; + } + return 0; +} + +int VecBufferPool::flush_all() { + if (!writable_) { + return 0; + } + int rc = 0; + for (size_t i = 0; i < page_table_.entry_num(); ++i) { + if (page_table_.is_block_dirty(i)) { + int r = page_table_.flush_block(i); + if (r != 0) { + rc = r; + } + } + } + return rc; +} + char *VecBufferPoolHandle::get_single_page(size_t file_offset, size_t len, size_t &out_page_id) { size_t first_page = file_offset / kVectorPageSize; @@ -286,6 +422,24 @@ int VecBufferPoolHandle::get_meta(size_t offset, size_t length, char *buffer) { return pool_.get_meta(offset, length, buffer); } +int VecBufferPoolHandle::write_range(size_t file_offset, size_t len, + const char *src) { + return pool_.write_range(file_offset, len, src); +} + +int VecBufferPoolHandle::write_meta(size_t offset, size_t length, + const char *buffer) { + return pool_.write_meta(offset, length, buffer); +} + +int VecBufferPoolHandle::flush_all() { + return pool_.flush_all(); +} + +bool VecBufferPoolHandle::writable() const { + return pool_.writable(); +} + void VecBufferPoolHandle::release_one(block_id_t block_id) { pool_.page_table_.release_block(block_id); } diff --git a/src/core/algorithm/flat/flat_streamer_entity.cc b/src/core/algorithm/flat/flat_streamer_entity.cc index 988f5fdfb..87d9a1906 100644 --- a/src/core/algorithm/flat/flat_streamer_entity.cc +++ b/src/core/algorithm/flat/flat_streamer_entity.cc @@ -165,13 +165,20 @@ int FlatStreamerEntity::add(uint64_t key, const void *vec, size_t size) { IndexStorage::MemoryBlock head_block; this->get_head_block(head_block); - const BlockLocation *bl = - reinterpret_cast(head_block.data()); - if (ailego_unlikely(bl == nullptr)) { - LOG_ERROR("Failed to get block loc"); - return IndexError_ReadData; + BlockLocation block; + { + const BlockLocation *bl = + reinterpret_cast(head_block.data()); + if (ailego_unlikely(bl == nullptr)) { + LOG_ERROR("Failed to get block loc"); + return IndexError_ReadData; + } + block = *bl; } - BlockLocation block = *bl; + // Release the head block reference early so that the buffer pool ref_count + // and memory budget held by it do not block subsequent acquire/evict in this + // function (alloc_block / add_to_block may compete for the same memory). + head_block.reset(nullptr); if (!this->is_valid_block(block)) { int ret = this->alloc_block(block, &block); @@ -922,6 +929,9 @@ int FlatStreamerEntity::add_vector_with_id(const uint32_t id, const void *query, this->get_head_block(head_block); BlockLocation block = *reinterpret_cast(head_block.data()); + // Release buffer-pool pin before any alloc_block() call that may trigger + // append_segment() and rebuild the pool (same reason as in add()). + head_block.reset(nullptr); if (!this->is_valid_block(block)) { int ret = this->alloc_block(block, &block); if (ailego_unlikely(ret != 0)) { diff --git a/src/core/algorithm/hnsw/hnsw_index_hash.h b/src/core/algorithm/hnsw/hnsw_index_hash.h index 1557dcd93..29d81ac92 100644 --- a/src/core/algorithm/hnsw/hnsw_index_hash.h +++ b/src/core/algorithm/hnsw/hnsw_index_hash.h @@ -41,9 +41,9 @@ class HnswIndexHashMap { items_(reinterpret_cast(data)) {} //! Return a empty loc or the key item loc - Slot(Chunk::Pointer &&chunk, IndexStorage::MemoryBlock &&mem_block) - : chunk_(std::move(chunk)), items_block_(std::move(mem_block)) { - items_ = reinterpret_cast(items_block_.data()); + Slot(Chunk::Pointer &&chunk, std::vector &&local_data) + : chunk_(std::move(chunk)), local_data_(std::move(local_data)) { + items_ = reinterpret_cast(local_data_.data()); } const_iterator find(key_type key, uint32_t max_items, uint32_t mask) const { auto it = &items_[key & mask]; @@ -73,8 +73,8 @@ class HnswIndexHashMap { private: Chunk::Pointer chunk_{}; - const Item *items_{nullptr}; // point to chunk data - IndexStorage::MemoryBlock items_block_{}; + const Item *items_{nullptr}; // point to local_data_ + std::vector local_data_{}; }; public: @@ -114,9 +114,9 @@ class HnswIndexHashMap { } int cleanup(void) { - broker_.reset(); slots_.clear(); slots_.shrink_to_fit(); + broker_.reset(); mask_bits_ = 0U; slot_items_ = 0U; slot_loc_mask_ = 0U; @@ -179,14 +179,10 @@ class HnswIndexHashMap { LOG_ERROR("Chunk resize failed, size=%zu", size); return false; } - //! Read the whole data to memory - IndexStorage::MemoryBlock data_block; - if (ailego_unlikely(chunk->read(0U, data_block, size) != size)) { - LOG_ERROR("Chunk read failed, size=%zu", size); - return false; - } - - slots_.emplace_back(std::move(chunk), std::move(data_block)); + //! Use a local zero-initialized buffer; new chunks contain all zeros, + //! so no buffer-pool read is needed and no ref_count is pinned. + std::vector local_buf(size, 0); + slots_.emplace_back(std::move(chunk), std::move(local_buf)); return true; } @@ -208,13 +204,14 @@ class HnswIndexHashMap { i, chunk->data_size(), size); return IndexError_InvalidFormat; } - //! Read the whole data to memory - IndexStorage::MemoryBlock data_block; - if (ailego_unlikely(chunk->read(0U, data_block, size) != size)) { - LOG_ERROR("Chunk read failed, size=%zu", size); - return false; + //! Copy chunk data into a local buffer via fetch() so that no + //! buffer-pool block is pinned for the lifetime of the Slot. + std::vector local_buf(size); + if (ailego_unlikely(chunk->fetch(0U, local_buf.data(), size) != size)) { + LOG_ERROR("Chunk fetch failed, size=%zu", size); + return IndexError_InvalidFormat; } - slots_.emplace_back(std::move(chunk), std::move(data_block)); + slots_.emplace_back(std::move(chunk), std::move(local_buf)); } return 0; } diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index d0a05fd37..b6cd67d75 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -14,7 +14,10 @@ #include #include +#include +#include #include +#include #include #include #include @@ -72,7 +75,16 @@ class BufferStorage : public IndexStorage { } //! Fetch data from segment (with own buffer) + //! + //! LOCKING: takes a shared_lock on owner_->mapping_mutex_ so that + //! append_segment() / close_index() cannot tear down the pool mid-call. size_t fetch(size_t offset, void *buf, size_t len) const override { + std::shared_lock latch(owner_->mapping_mutex_); + if (ailego_unlikely(!owner_->buffer_pool_handle_)) { + LOG_ERROR("WrappedSegment::fetch: handle is null, file[%s], id[%zu]", + owner_->file_name_.c_str(), segment_id_); + return 0; + } if (ailego_unlikely(offset + len > segment_->meta()->data_size)) { auto meta = segment_->meta(); if (offset > meta->data_size) { @@ -91,7 +103,15 @@ class BufferStorage : public IndexStorage { } //! Read data from segment + //! LOCKING: see fetch() above for rationale. size_t read(size_t offset, const void **data, size_t len) override { + std::shared_lock latch(owner_->mapping_mutex_); + if (ailego_unlikely(!owner_->buffer_pool_handle_)) { + LOG_ERROR("WrappedSegment::read: handle is null, file[%s], id[%zu]", + owner_->file_name_.c_str(), segment_id_); + *data = nullptr; + return 0; + } if (ailego_unlikely(offset + len > segment_->meta()->data_size)) { auto meta = segment_->meta(); if (offset > meta->data_size) { @@ -111,17 +131,24 @@ class BufferStorage : public IndexStorage { char *raw = owner_->buffer_pool_handle_->get_single_page(abs_offset, len, page_id); if (!raw) { + *data = nullptr; return 0; } *data = raw; + // Release the buffer-pool ref count acquired by get_single_page(). + // The pointer remains valid as long as the page is not evicted; callers + // needing a stable pin should use the read(MemoryBlock&) overload. + owner_->buffer_pool_handle_->release_one(page_id); return len; } char *tmp = static_cast(ailego_aligned_malloc(len, 4096)); if (!tmp) { + *data = nullptr; return 0; } if (!owner_->buffer_pool_handle_->read_range(abs_offset, len, tmp)) { ailego_free(tmp); + *data = nullptr; return 0; } owner_->register_tmp_buffer(tmp); @@ -129,7 +156,18 @@ class BufferStorage : public IndexStorage { return len; } + //! LOCKING: shared_lock held only while wiring the MemoryBlock. The + //! MemoryBlock carries its own ref_count (raised by get_single_page()) + //! and will release it via its destructor. size_t read(size_t offset, MemoryBlock &data, size_t len) override { + std::shared_lock latch(owner_->mapping_mutex_); + if (ailego_unlikely(!owner_->buffer_pool_handle_)) { + LOG_ERROR( + "WrappedSegment::read(MemoryBlock&): handle is null, file[%s], " + "id[%zu]", + owner_->file_name_.c_str(), segment_id_); + return 0; + } if (ailego_unlikely(offset + len > segment_->meta()->data_size)) { auto meta = segment_->meta(); if (offset > meta->data_size) { @@ -170,18 +208,62 @@ class BufferStorage : public IndexStorage { } //! Write data into the storage with offset - size_t write(size_t /*offset*/, const void * /*data*/, - size_t len) override { + //! LOCKING: see fetch() above for rationale. + size_t write(size_t offset, const void *data, size_t len) override { + std::shared_lock latch(owner_->mapping_mutex_); + if (ailego_unlikely(!owner_->buffer_pool_handle_ || + !owner_->buffer_pool_)) { + LOG_ERROR("WrappedSegment::write: pool is null, file[%s], id[%zu]", + owner_->file_name_.c_str(), segment_id_); + return 0; + } + // In read-only mode the write is a silent no-op so that callers that + // unconditionally write (e.g. CRC updates) do not return an error. + if (!owner_->buffer_pool_->writable()) { + return len; + } + if (ailego_unlikely(offset + len > capacity_)) { + LOG_ERROR("write() exceeds segment capacity: offset=%zu len=%zu cap=%zu", + offset, len, capacity_); + return 0; + } + auto meta = segment_->meta(); + if (offset + len > meta->data_size) { + meta->data_size = offset + len; + meta->padding_size = capacity_ - meta->data_size; + owner_->set_as_dirty(); + } + size_t abs_offset = segment_header_start_offset_ + + segment_header_->content_offset + + segment_->meta()->data_index + offset; + if (owner_->buffer_pool_handle_->write_range( + abs_offset, len, static_cast(data)) != 0) { + LOG_ERROR("write() page-cache write_range failed at abs_offset=%zu", + abs_offset); + return 0; + } return len; } //! Resize size of data - size_t resize(size_t /*size*/) override { - return 0; + size_t resize(size_t size) override { + auto meta = segment_->meta(); + if (meta->data_size != size) { + if (size > capacity_) { + size = capacity_; + } + meta->data_size = size; + meta->padding_size = capacity_ - size; + owner_->set_as_dirty(); + } + return size; } //! Update crc of data - void update_data_crc(uint32_t /*crc*/) override {} + void update_data_crc(uint32_t crc) override { + segment_->meta()->data_crc = crc; + owner_->set_as_dirty(); + } //! Clone the segment IndexStorage::Segment::Pointer clone(void) override { @@ -212,6 +294,10 @@ class BufferStorage : public IndexStorage { //! Initialize storage int init(const ailego::Params ¶ms) override { + uint32_t val = params.get_as_uint32(MMAPFILE_STORAGE_SEGMENT_META_CAPACITY); + if (val != 0) { + segment_meta_capacity_ = val; + } return 0; } @@ -222,9 +308,25 @@ class BufferStorage : public IndexStorage { } //! Open storage - int open(const std::string &path, bool /*create_if_missing*/) override { + int open(const std::string &path, bool create_if_missing) override { file_name_ = path; - buffer_pool_ = std::make_shared(path); + if (!ailego::File::IsExist(path) && create_if_missing) { + size_t last_slash = path.rfind('/'); + if (last_slash != std::string::npos) { + ailego::File::MakePath(path.substr(0, last_slash)); + } + int error_code = this->init_index(path); + if (error_code != 0) { + LOG_ERROR("init_index failed for %s, errno=%d", path.c_str(), + error_code); + return error_code; + } + } + + // Open in writable mode when the caller expects to modify the index + // (create_if_missing=true implies write intent, same as MMapFileStorage). + buffer_pool_ = std::make_shared( + path, /*writable=*/create_if_missing, /*create=*/false); buffer_pool_handle_ = std::make_shared( buffer_pool_->get_handle()); int ret = ParseToMapping(); @@ -236,9 +338,10 @@ class BufferStorage : public IndexStorage { return ret; } LOG_INFO( - "BufferStorage opened: file=%s, max_segment_size=%lu, " + "BufferStorage opened: file=%s, writable=%d, max_segment_size=%lu, " "segment_count=%zu", - file_name_.c_str(), max_segment_size_, segments_.size()); + file_name_.c_str(), static_cast(create_if_missing), + max_segment_size_, segments_.size()); return 0; } @@ -247,7 +350,18 @@ class BufferStorage : public IndexStorage { tmp_buffers_.push_back(buf); } + //! Acquire a page-table block. + //! + //! LOCKING CONTRACT: caller MUST already hold a shared_lock (or + //! unique_lock) on mapping_mutex_. char *get_buffer(size_t offset, size_t length, size_t /*block_id*/) { + if (ailego_unlikely(!buffer_pool_handle_)) { + LOG_ERROR( + "BufferStorage::get_buffer: handle is null, file[%s], " + "offset[%zu], length[%zu]", + file_name_.c_str(), offset, length); + return nullptr; + } char *tmp = static_cast(ailego_aligned_malloc(length, 4096)); if (!tmp) { return nullptr; @@ -260,13 +374,13 @@ class BufferStorage : public IndexStorage { return tmp; } - int get_meta(size_t offset, size_t length, char *out) { - return buffer_pool_handle_->get_meta(offset, length, out); - } - int ParseHeader(size_t offset) { std::unique_ptr buffer(new char[sizeof(header_)]); - if (get_meta(offset, sizeof(header_), buffer.get()) != 0) { + // NOTE: bypass a wrapper get_meta() -- ParseHeader is called from + // reopen_pool() which already holds a unique_lock on mapping_mutex_ + // (std::shared_mutex is not reentrant -> deadlock). + if (buffer_pool_handle_->get_meta(offset, sizeof(header_), buffer.get()) != + 0) { LOG_ERROR("Get segment header failed."); return IndexError_Runtime; } @@ -286,7 +400,9 @@ class BufferStorage : public IndexStorage { int ParseFooter(size_t offset) { std::unique_ptr buffer(new char[sizeof(footer_)]); - if (get_meta(offset, sizeof(footer_), buffer.get()) != 0) { + // Bypass wrapper -- see ParseHeader() comment for why. + if (buffer_pool_handle_->get_meta(offset, sizeof(footer_), buffer.get()) != + 0) { LOG_ERROR("Get segment footer failed."); return IndexError_Runtime; } @@ -305,11 +421,16 @@ class BufferStorage : public IndexStorage { } int ParseSegment(size_t offset) { - std::lock_guard latch(mapping_mutex_); + // NOTE: this function is only called from ParseToMapping(), which is + // itself called from either open() (single-threaded construction) or + // reopen_pool() (always invoked under the unique_lock held by + // append_segment()). Do NOT add an internal lock here -- doing so would + // deadlock the append_segment() path. std::unique_ptr segment_buffer = std::make_unique(footer_.segments_meta_size); - if (get_meta(offset, footer_.segments_meta_size, segment_buffer.get()) != - 0) { + // Bypass wrapper -- see ParseHeader() comment for why. + if (buffer_pool_handle_->get_meta(offset, footer_.segments_meta_size, + segment_buffer.get()) != 0) { LOG_ERROR("Get segment meta failed."); return IndexError_Runtime; } @@ -337,15 +458,20 @@ class BufferStorage : public IndexStorage { if (iter->segment_id_offset < segment_ids_offset) { segment_ids_offset = iter->segment_id_offset; } - id_hash_.emplace( - std::string(reinterpret_cast(segment_start) + - iter->segment_id_offset), - segments_.size()); - segments_.emplace( - std::string(reinterpret_cast(segment_start) + - iter->segment_id_offset), - IndexMapping::SegmentInfo{IndexMapping::Segment{iter}, - current_header_start_offset_, &header_}); + // Assign a stable numeric ID (block_id in the page table) to this + // segment. We use id_hash_.size() rather than segments_.size() because + // segments_ is intentionally NOT cleared between appends (to keep + // existing WrappedSegment pointers valid), so segments_.size() would + // reflect stale entries and produce wrong IDs on re-parse. + const std::string seg_name(reinterpret_cast(segment_start) + + iter->segment_id_offset); + id_hash_[seg_name] = id_hash_.size(); + // Update the segments_ entry in-place so that any WrappedSegment + // instances that already hold a pointer to this entry (via + // &segments_[name].segment) continue to use the refreshed meta_ptr_ + // after the re-parse. + segments_[seg_name] = IndexMapping::SegmentInfo{ + IndexMapping::Segment{iter}, current_header_start_offset_, &header_}; max_segment_size_ = std::max(max_segment_size_, iter->data_size + iter->padding_size); if (sizeof(IndexFormat::SegmentMeta) * footer_.segment_count > @@ -405,6 +531,12 @@ class BufferStorage : public IndexStorage { return ret; } + // Record per-chain metadata offsets so flush_index() can write + // updated segment metas and footers back to the backing file. + meta_chains_.push_back({current_header_start_offset_, footer_offset, + segment_start_offset, + footer_.segments_meta_size}); + if (footer_.next_meta_header_offset == 0) { break; } @@ -461,16 +593,17 @@ class BufferStorage : public IndexStorage { } protected: - //! Initialize index version segment - int init_version_segment(void) { + //! Initialize index version segment (writes content into an IndexMapping). + //! Only intended to be called from init_index() while `mapping` is still + //! open in create-mode. + int init_version_segment(IndexMapping &mapping) { size_t data_size = std::strlen(IndexVersion::Details()); - int error_code = - this->append_segment(INDEX_VERSION_SEGMENT_NAME, data_size); + int error_code = mapping.append(INDEX_VERSION_SEGMENT_NAME, data_size); if (error_code != 0) { return error_code; } - - auto segment = &get_segment_info(INDEX_VERSION_SEGMENT_NAME)->segment; + IndexMapping::Segment *segment = + mapping.map(INDEX_VERSION_SEGMENT_NAME, false, false); if (!segment) { return IndexError_MMapFile; } @@ -484,17 +617,35 @@ class BufferStorage : public IndexStorage { return 0; } - //! Initialize index file - int init_index(const std::string & /*path*/) { - // Add index version - int error_code = this->init_version_segment(); - if (error_code != 0) { - return error_code; + //! Create the initial on-disk index structure and write the mandatory + //! version segment. Uses IndexMapping (the same engine as MMapFileStorage) + //! so the produced file is fully compatible with both storage backends. + int init_index(const std::string &path) { + IndexMapping mapping; + int ret = mapping.create(path, segment_meta_capacity_); + if (ret != 0) { + LOG_ERROR( + "BufferStorage failed to create index file: path[%s], errno[%d]", + path.c_str(), ret); + return ret; } - - // Refresh mapping - this->refresh_index(0); - return 0; + ret = this->init_version_segment(mapping); + if (ret != 0) { + LOG_ERROR( + "BufferStorage failed to append version segment: path[%s], errno[%d]", + path.c_str(), ret); + mapping.close(); + return ret; + } + mapping.refresh(0); + ret = mapping.flush(); + mapping.close(); + if (ret != 0) { + LOG_ERROR( + "BufferStorage failed to flush new index file: path[%s], errno[%d]", + path.c_str(), ret); + } + return ret; } //! Set the index file as dirty @@ -503,16 +654,90 @@ class BufferStorage : public IndexStorage { } //! Refresh meta information (checksum, update time, etc.) - void refresh_index(uint64_t /*chkp*/) {} + void refresh_index(uint64_t /*chkp*/) { + // In BufferStorage the segment metadata lives in buffer_pool_buffers_. + // CRC recomputation and disk write are deferred to flush_index(). + // Just mark dirty so flush_index() will include the metadata write. + index_dirty_ = true; + } - //! Flush index storage + //! Flush index storage: persists any pending meta changes (segments_meta + + //! footer) for each header chain, then asks the page cache to write back + //! dirty data pages. int flush_index(void) { + if (!index_dirty_) { + return 0; + } + // SHARED LOCK: keep mapping_mutex_ held for the whole flush so that the + // pool/handle cannot be torn down by append_segment()/close_index() + // mid-flush. + std::shared_lock latch(mapping_mutex_); + // NULL GUARD: a previous append_segment() may have left the pool in a + // torn-down state. + if (!buffer_pool_ || !buffer_pool_handle_) { + LOG_ERROR("BufferStorage::flush_index skipped: pool not ready, file[%s]", + file_name_.c_str()); + return IndexError_Runtime; + } + if (!buffer_pool_->writable()) { + // Read-only pool: nothing to flush. + index_dirty_ = false; + return 0; + } + // Flush all dirty data blocks to the backing file first. + if (buffer_pool_handle_->flush_all() != 0) { + LOG_ERROR("flush_all data blocks failed: file[%s]", file_name_.c_str()); + return IndexError_WriteData; + } + // For each metadata chain, recompute the segment-meta CRC, update the + // footer (segments_meta_crc + footer_crc + update_time), and write both + // the segment metadata and the footer back to the backing file. + for (size_t ci = 0; + ci < meta_chains_.size() && ci < buffer_pool_buffers_.size(); ++ci) { + const MetaChain &chain = meta_chains_[ci]; + const char *seg_buf = buffer_pool_buffers_[ci].get(); + // Read the on-disk footer into a local copy so we can update it. + IndexFormat::MetaFooter footer; + if (buffer_pool_handle_->get_meta( + chain.footer_file_offset, sizeof(footer), + reinterpret_cast(&footer)) != 0) { + LOG_ERROR("Failed to read footer for flush: file[%s], chain[%zu]", + file_name_.c_str(), ci); + return IndexError_Runtime; + } + // Recompute segment metadata CRC and refresh the footer. + footer.segments_meta_crc = + ailego::Crc32c::Hash(seg_buf, chain.segment_meta_size, 0u); + IndexFormat::UpdateMetaFooter(&footer, 0); + // Write segment metadata back to disk. + if (buffer_pool_handle_->write_meta(chain.segment_meta_file_offset, + chain.segment_meta_size, + seg_buf) != 0) { + LOG_ERROR("Failed to write segment meta: file[%s], chain[%zu]", + file_name_.c_str(), ci); + return IndexError_WriteData; + } + // Write the updated footer back to disk. + if (buffer_pool_handle_->write_meta( + chain.footer_file_offset, sizeof(footer), + reinterpret_cast(&footer)) != 0) { + LOG_ERROR("Failed to write footer: file[%s], chain[%zu]", + file_name_.c_str(), ci); + return IndexError_WriteData; + } + } + index_dirty_ = false; return 0; } //! Close index storage void close_index(void) { - std::lock_guard latch(mapping_mutex_); + // Flush any outstanding dirty metadata to disk before tearing down. + // IMPORTANT: call flush_index() BEFORE taking the unique_lock below; + // flush_index() internally takes a shared_lock on the same mutex and + // std::shared_mutex is NOT reentrant. + this->flush_index(); + std::unique_lock latch(mapping_mutex_); file_name_.clear(); id_hash_.clear(); segments_.clear(); @@ -531,22 +756,167 @@ class BufferStorage : public IndexStorage { buffer_pool_.reset(); max_segment_size_ = 0; buffer_pool_buffers_.clear(); + meta_chains_.clear(); + // Drop retired pools last -- any stray MemoryBlock still holding a raw + // handle pointer would hit use-after-free here, but by close_index() + // time all build/search threads are expected to have joined. + retired_handles_.clear(); + retired_pools_.clear(); + current_header_start_offset_ = 0; + } + + //! Reopen the buffer pool and reload the mapping. Used both as the final + //! success step of append_segment() and as a rollback path when any + //! IndexMapping operation fails mid-way through append_segment(). + //! + //! VecBufferPool's constructor throws on open()/fstat() failure; we catch + //! that here and translate it into an error code. + int reopen_pool() { + try { + buffer_pool_ = std::make_shared( + file_name_, /*writable=*/true, /*create=*/false); + buffer_pool_handle_ = std::make_shared( + buffer_pool_->get_handle()); + } catch (const std::exception &e) { + LOG_ERROR( + "BufferStorage::reopen_pool failed to create pool: file[%s], " + "what[%s]", + file_name_.c_str(), e.what()); + buffer_pool_.reset(); + buffer_pool_handle_.reset(); + return IndexError_Runtime; + } + int ret = ParseToMapping(); + if (ret != 0) { + LOG_ERROR( + "BufferStorage::reopen_pool failed to parse mapping: file[%s], " + "errno[%d]", + file_name_.c_str(), ret); + return ret; + } + return buffer_pool_->init(); } //! Append a segment into storage - int append_segment(const std::string & /*id*/, size_t /*size*/) { - return 0; + int append_segment(const std::string &id, size_t size) { + // Flush any in-memory metadata changes (data_size, padding_size, CRC) + // accumulated by prior write()/resize() calls BEFORE we reset the buffer + // pool below. Without this flush, those changes would be lost when + // buffer_pool_buffers_ is cleared and re-populated from disk. + // IMPORTANT: call flush_index() BEFORE taking the unique_lock below; + // flush_index() internally takes a shared_lock on the same mutex and + // std::shared_mutex is NOT reentrant. + this->flush_index(); + + // UNIQUE LOCK: hold the mutex for the entire structural modification + // (reset -> IndexMapping.open/append/flush -> reopen_pool). Concurrent + // readers/writers taking shared_lock will block here. + std::unique_lock latch(mapping_mutex_); + + // RETIRE the old pool instead of immediately destroying it. MemoryBlock + // objects held by other threads carry a ref_count on a block inside this + // pool but store only a RAW VecBufferPoolHandle*; if we reset() the + // shared_ptr here, the pool destructor fires while those ref_counts are + // still > 0 and the is_released() assert trips. By parking in + // retired_pools_ the pool survives until all external refs are gone. + auto prune_retired = [&]() { + size_t w = 0; + for (size_t r = 0; r < retired_pools_.size(); ++r) { + bool any_held = false; + auto &pt = retired_pools_[r]->page_table_; + for (size_t i = 0; i < pt.entry_num(); ++i) { + if (!pt.is_released(i)) { + any_held = true; + break; + } + } + if (any_held) { + if (w != r) { + retired_pools_[w] = std::move(retired_pools_[r]); + retired_handles_[w] = std::move(retired_handles_[r]); + } + ++w; + } + } + retired_pools_.resize(w); + retired_handles_.resize(w); + }; + prune_retired(); + + // Flush and release the buffer pool so IndexMapping can safely open + // and structurally modify the same file. + if (buffer_pool_handle_) { + buffer_pool_handle_->flush_all(); + } + // Park the old pool + handle. + if (buffer_pool_) { + retired_pools_.push_back(std::move(buffer_pool_)); + retired_handles_.push_back(std::move(buffer_pool_handle_)); + } else { + buffer_pool_handle_.reset(); + } + buffer_pool_.reset(); + // Reset parse-time state EXCEPT for segments_: WrappedSegment instances + // held by callers store raw pointers into segments_' mapped values. + // The C++ standard guarantees that unordered_map references/pointers to + // mapped values are never invalidated by insertions, so we can safely + // leave segments_ intact and update entries in-place during re-parse. + id_hash_.clear(); + buffer_pool_buffers_.clear(); + meta_chains_.clear(); + current_header_start_offset_ = 0u; + max_segment_size_ = 0u; + memset(&header_, 0, sizeof(header_)); + memset(&footer_, 0, sizeof(footer_)); + + // Delegate the structural append to IndexMapping (same engine used by + // MMapFileStorage) so the on-disk format stays consistent. + IndexMapping mapping; + int ret = mapping.open(file_name_, /*cow=*/false, /*full_mode=*/false); + if (ret != 0) { + LOG_ERROR( + "BufferStorage::append_segment failed to open IndexMapping: " + "file[%s], id[%s], errno[%d]", + file_name_.c_str(), id.c_str(), ret); + reopen_pool(); + return ret; + } + ret = mapping.append(id, size); + if (ret != 0) { + LOG_ERROR( + "BufferStorage::append_segment failed to append segment: " + "file[%s], id[%s], errno[%d]", + file_name_.c_str(), id.c_str(), ret); + mapping.close(); + reopen_pool(); + return ret; + } + mapping.refresh(0); + ret = mapping.flush(); + mapping.close(); + if (ret != 0) { + LOG_ERROR( + "BufferStorage::append_segment failed to flush: " + "file[%s], id[%s], errno[%d]", + file_name_.c_str(), id.c_str(), ret); + reopen_pool(); + return ret; + } + + // Reopen the buffer pool and reload the mapping so the new segment is + // accessible via get_segment_info() / get(). + return reopen_pool(); } //! Test if a segment exists bool has_segment(const std::string &id) const { - std::lock_guard latch(mapping_mutex_); + std::shared_lock latch(mapping_mutex_); return (segments_.find(id) != segments_.end()); } //! Get a segment from storage IndexMapping::SegmentInfo *get_segment_info(const std::string &id) { - std::lock_guard latch(mapping_mutex_); + std::shared_lock latch(mapping_mutex_); auto iter = segments_.find(id); if (iter == segments_.end()) { return nullptr; @@ -556,7 +926,7 @@ class BufferStorage : public IndexStorage { private: bool index_dirty_{false}; - mutable std::mutex mapping_mutex_{}; + mutable std::shared_mutex mapping_mutex_{}; std::vector tmp_buffers_{}; mutable std::mutex tmp_buffers_mutex_{}; @@ -570,10 +940,29 @@ class BufferStorage : public IndexStorage { uint64_t max_segment_size_{0}; std::vector> buffer_pool_buffers_{}; + // Retired pools: see prune_retired() in append_segment() for the + // life-cycle contract. + std::vector retired_pools_{}; + std::vector retired_handles_{}; + ailego::VecBufferPool::Pointer buffer_pool_{nullptr}; ailego::VecBufferPoolHandle::Pointer buffer_pool_handle_{nullptr}; uint64_t current_header_start_offset_{0u}; uint64_t buffer_size_{2lu * 1024 * 1024 * 1024}; // 2G + + // Capacity (in bytes) of the segment metadata section written by + // init_index(). + uint32_t segment_meta_capacity_{4096u}; + + // Per-header-chain file offsets used by flush_index() to write updated + // segment metadata and footer back to the backing file after writes. + struct MetaChain { + uint64_t header_start_offset; + uint64_t footer_file_offset; + uint64_t segment_meta_file_offset; + uint32_t segment_meta_size; + }; + std::vector meta_chains_{}; }; INDEX_FACTORY_REGISTER_STORAGE(BufferStorage); diff --git a/src/include/zvec/ailego/buffer/vector_page_table.h b/src/include/zvec/ailego/buffer/vector_page_table.h index c6a08c9da..7fb0a9946 100644 --- a/src/include/zvec/ailego/buffer/vector_page_table.h +++ b/src/include/zvec/ailego/buffer/vector_page_table.h @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -48,10 +49,17 @@ class VectorPageTable { struct Entry { std::atomic ref_count; std::atomic in_evict_queue; + std::atomic is_dirty; char *buffer; + size_t file_offset; }; public: + // Callback invoked by evict_block() to persist a dirty block before its + // memory is released. Signature: (block_id, buffer, size, file_offset). + using FlushCallback = + std::function; + VectorPageTable() : entry_num_(0), entries_(nullptr) { BlockEvictionQueue::get_instance().set_valid(this); } @@ -73,7 +81,43 @@ class VectorPageTable { void evict_block(block_id_t block_id); - char *set_block_acquired(block_id_t block_id, char *buffer); + char *set_block_acquired(block_id_t block_id, char *buffer, + size_t file_offset); + + void set_flush_callback(FlushCallback cb) { + flush_callback_ = std::move(cb); + } + + //! Mark a loaded block as dirty so that it is persisted on eviction. + void mark_dirty(block_id_t block_id) { + assert(block_id < entry_num_); + entries_[block_id].is_dirty.store(true, std::memory_order_relaxed); + } + + bool is_block_dirty(block_id_t block_id) const { + assert(block_id < entry_num_); + return entries_[block_id].is_dirty.load(std::memory_order_relaxed); + } + + //! Flush a single dirty block without evicting it. Caller guarantees the + //! block is currently loaded (buffer != nullptr). + int flush_block(block_id_t block_id) { + assert(block_id < entry_num_); + Entry &entry = entries_[block_id]; + char *buffer = entry.buffer; + if (!buffer || !flush_callback_) { + return 0; + } + if (!entry.is_dirty.load(std::memory_order_relaxed)) { + return 0; + } + int rc = flush_callback_(block_id, buffer, kVectorPageSize, + entry.file_offset); + if (rc == 0) { + entry.is_dirty.store(false, std::memory_order_relaxed); + } + return rc; + } size_t entry_num() const { return entry_num_; @@ -92,6 +136,7 @@ class VectorPageTable { private: size_t entry_num_{0}; Entry *entries_{nullptr}; + FlushCallback flush_callback_{}; }; class VecBufferPoolHandle; @@ -102,8 +147,12 @@ class VecBufferPool { static constexpr size_t kMutexBucketCount = 64UL * 1024UL; - VecBufferPool(const std::string &filename); + VecBufferPool(const std::string &filename, bool writable = false, + bool create = false); ~VecBufferPool() { + // Flush any remaining dirty blocks before tearing down memory/fd so that + // writes are not silently lost. Safe to call even in read-only mode. + (void)this->flush_all(); for (size_t i = 0; i < page_table_.entry_num(); ++i) { assert(page_table_.is_released(i)); page_table_.evict_block(i); @@ -123,6 +172,23 @@ class VecBufferPool { int get_meta(size_t offset, size_t length, char *buffer); + //! Write a contiguous range via the page cache; marks touched pages dirty. + //! Returns 0 on success, -1 on failure (e.g. read-only pool or I/O error). + int write_range(size_t file_offset, size_t length, const char *src); + + //! Write raw bytes directly via pwrite, bypassing the page cache. Used for + //! metadata regions (header/footer/segments_meta) which are only read via + //! get_meta() and never cached. + int write_meta(size_t offset, size_t length, const char *buffer); + + //! Iterate all entries and persist any dirty blocks to disk. Safe to call + //! repeatedly; no-op in read-only mode. + int flush_all(); + + bool writable() const { + return writable_; + } + size_t file_size() const { return file_size_; } @@ -131,6 +197,7 @@ class VecBufferPool { int fd_; size_t file_size_; std::string file_name_; + bool writable_{false}; public: VectorPageTable page_table_; @@ -154,6 +221,14 @@ class VecBufferPoolHandle { int get_meta(size_t offset, size_t length, char *buffer); + int write_range(size_t file_offset, size_t len, const char *src); + + int write_meta(size_t offset, size_t length, const char *buffer); + + int flush_all(); + + bool writable() const; + void release_one(block_id_t block_id); void acquire_one(block_id_t block_id); diff --git a/tests/core/algorithm/flat/flat_streamer_buffer_test.cc b/tests/core/algorithm/flat/flat_streamer_buffer_test.cc index 6502d5321..cf4114750 100644 --- a/tests/core/algorithm/flat/flat_streamer_buffer_test.cc +++ b/tests/core/algorithm/flat/flat_streamer_buffer_test.cc @@ -168,6 +168,251 @@ TEST_F(FlatStreamerTest, TestLinearSearch) { read_streamer.reset(); } +TEST_F(FlatStreamerTest, TestLinearSearchBuffer) { + MemoryLimitPool::get_instance().init(2 * 1024UL * 1024UL * 1024UL); + IndexStreamer::Pointer write_streamer = + IndexFactory::CreateStreamer("FlatStreamer"); + ASSERT_TRUE(write_streamer != nullptr); + + Params params; + ASSERT_EQ(0, write_streamer->init(*index_meta_ptr_, params)); + auto storage = IndexFactory::CreateStorage("BufferStorage"); + ASSERT_NE(nullptr, storage); + Params stg_params; + ASSERT_EQ(0, storage->init(stg_params)); + ASSERT_EQ(0, storage->open(dir_ + "Test/LinearSearchBuffer", true)); + ASSERT_EQ(0, write_streamer->open(storage)); + + auto ctx = write_streamer->create_context(); + ASSERT_TRUE(!!ctx); + + size_t cnt = 10000UL; + IndexQueryMeta qmeta(IndexMeta::DT_FP32, dim); + for (size_t i = 0; i < cnt; i++) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + write_streamer->add_impl(i, vec.data(), qmeta, ctx); + } + write_streamer->flush(0UL); + write_streamer->close(); + write_streamer.reset(); + storage->close(); + + IndexStreamer::Pointer read_streamer = + IndexFactory::CreateStreamer("FlatStreamer"); + ASSERT_EQ(0, read_streamer->init(*index_meta_ptr_, params)); + auto read_storage = IndexFactory::CreateStorage("BufferStorage"); + ASSERT_NE(nullptr, read_storage); + ASSERT_EQ(0, read_storage->init(stg_params)); + ASSERT_EQ(0, read_storage->open(dir_ + "Test/LinearSearchBuffer", false)); + ASSERT_EQ(0, read_streamer->open(read_storage)); + size_t topk = 3; + auto provider = read_streamer->create_provider(); + for (size_t i = 0; i < cnt; i += 1) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result1 = ctx->result(); + ASSERT_EQ(topk, result1.size()); + IndexStorage::MemoryBlock block; + ASSERT_EQ(0, provider->get_vector(result1[0].key(), block)); + const float *data = (float *)block.data(); + for (size_t j = 0; j < dim; ++j) { + ASSERT_FLOAT_EQ(data[j], i); + } + ASSERT_EQ(i, result1[0].key()); + + for (size_t j = 0; j < dim; ++j) { + vec[j] = i + 0.1f; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result2 = ctx->result(); + ASSERT_EQ(topk, result2.size()); + ASSERT_EQ(i, result2[0].key()); + ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); + ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key()); + } + + ctx->set_topk(100U); + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = 10.1f; + } + ASSERT_EQ(0, read_streamer->search_bf_impl(vec.data(), qmeta, ctx)); + auto &result = ctx->result(); + ASSERT_EQ(100U, result.size()); + ASSERT_EQ(10, result[0].key()); + ASSERT_EQ(11, result[1].key()); + ASSERT_EQ(5, result[10].key()); + ASSERT_EQ(0, result[20].key()); + ASSERT_EQ(30, result[30].key()); + ASSERT_EQ(35, result[35].key()); + ASSERT_EQ(99, result[99].key()); + + ElapsedTime elapsed_time; + for (size_t i = 0; i < cnt; i += 1) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result1 = ctx->result(); + ASSERT_EQ(topk, result1.size()); + IndexStorage::MemoryBlock block; + ASSERT_EQ(0, provider->get_vector(result1[0].key(), block)); + const float *data = (float *)block.data(); + for (size_t j = 0; j < dim; ++j) { + ASSERT_FLOAT_EQ(data[j], i); + } + ASSERT_EQ(i, result1[0].key()); + + for (size_t j = 0; j < dim; ++j) { + vec[j] = i + 0.1f; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result2 = ctx->result(); + ASSERT_EQ(topk, result2.size()); + ASSERT_EQ(i, result2[0].key()); + ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); + ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key()); + } + cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl; + + read_streamer->close(); + read_streamer.reset(); +} + +TEST_F(FlatStreamerTest, TestLinearSearchBufferMMap) { + MemoryLimitPool::get_instance().init(2 * 1024UL * 1024UL * 1024UL); + IndexStreamer::Pointer write_streamer = + IndexFactory::CreateStreamer("FlatStreamer"); + ASSERT_TRUE(write_streamer != nullptr); + + Params params; + ASSERT_EQ(0, write_streamer->init(*index_meta_ptr_, params)); + auto storage = IndexFactory::CreateStorage("BufferStorage"); + ASSERT_NE(nullptr, storage); + Params stg_params; + ASSERT_EQ(0, storage->init(stg_params)); + ASSERT_EQ(0, storage->open(dir_ + "Test/LinearSearchBuffer", true)); + ASSERT_EQ(0, write_streamer->open(storage)); + + auto ctx = write_streamer->create_context(); + ASSERT_TRUE(!!ctx); + + size_t cnt = 10000UL; + IndexQueryMeta qmeta(IndexMeta::DT_FP32, dim); + for (size_t i = 0; i < cnt; i++) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + write_streamer->add_impl(i, vec.data(), qmeta, ctx); + } + write_streamer->flush(0UL); + write_streamer->close(); + write_streamer.reset(); + storage->close(); + + IndexStreamer::Pointer read_streamer = + IndexFactory::CreateStreamer("FlatStreamer"); + ASSERT_EQ(0, read_streamer->init(*index_meta_ptr_, params)); + auto read_storage = IndexFactory::CreateStorage("MMapFileStorage"); + ASSERT_NE(nullptr, read_storage); + ASSERT_EQ(0, read_storage->init(stg_params)); + ASSERT_EQ(0, read_storage->open(dir_ + "Test/LinearSearchBuffer", false)); + ASSERT_EQ(0, read_streamer->open(read_storage)); + size_t topk = 3; + auto provider = read_streamer->create_provider(); + for (size_t i = 0; i < cnt; i += 1) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result1 = ctx->result(); + ASSERT_EQ(topk, result1.size()); + IndexStorage::MemoryBlock block; + ASSERT_EQ(0, provider->get_vector(result1[0].key(), block)); + const float *data = (float *)block.data(); + for (size_t j = 0; j < dim; ++j) { + ASSERT_FLOAT_EQ(data[j], i); + } + ASSERT_EQ(i, result1[0].key()); + + for (size_t j = 0; j < dim; ++j) { + vec[j] = i + 0.1f; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result2 = ctx->result(); + ASSERT_EQ(topk, result2.size()); + ASSERT_EQ(i, result2[0].key()); + ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); + ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key()); + } + + ctx->set_topk(100U); + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = 10.1f; + } + ASSERT_EQ(0, read_streamer->search_bf_impl(vec.data(), qmeta, ctx)); + auto &result = ctx->result(); + ASSERT_EQ(100U, result.size()); + ASSERT_EQ(10, result[0].key()); + ASSERT_EQ(11, result[1].key()); + ASSERT_EQ(5, result[10].key()); + ASSERT_EQ(0, result[20].key()); + ASSERT_EQ(30, result[30].key()); + ASSERT_EQ(35, result[35].key()); + ASSERT_EQ(99, result[99].key()); + + ElapsedTime elapsed_time; + for (size_t i = 0; i < cnt; i += 1) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result1 = ctx->result(); + ASSERT_EQ(topk, result1.size()); + IndexStorage::MemoryBlock block; + ASSERT_EQ(0, provider->get_vector(result1[0].key(), block)); + const float *data = (float *)block.data(); + for (size_t j = 0; j < dim; ++j) { + ASSERT_FLOAT_EQ(data[j], i); + } + ASSERT_EQ(i, result1[0].key()); + + for (size_t j = 0; j < dim; ++j) { + vec[j] = i + 0.1f; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result2 = ctx->result(); + ASSERT_EQ(topk, result2.size()); + ASSERT_EQ(i, result2[0].key()); + ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); + ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key()); + } + cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl; + + read_streamer->close(); + read_streamer.reset(); +} + + TEST_F(FlatStreamerTest, TestLinearSearchWithLRU) { MemoryLimitPool::get_instance().init(100 * 1024UL * 1024UL); #ifdef __ANDROID__ @@ -350,7 +595,6 @@ TEST_F(FlatStreamerTest, TestLinearSearchMMap) { ASSERT_EQ(topk, result1.size()); IndexStorage::MemoryBlock block; ASSERT_EQ(0, provider->get_vector(result1[0].key(), block)); - const float *data = (float *)block.data(); for (size_t j = 0; j < dim; ++j) { const float *data = (float *)provider->get_vector(result1[0].key()); EXPECT_FLOAT_EQ(data[j], i); diff --git a/tests/core/algorithm/hnsw/hnsw_streamer_buffer_test.cc b/tests/core/algorithm/hnsw/hnsw_streamer_buffer_test.cc index 30f9d7cbb..00d2251b2 100644 --- a/tests/core/algorithm/hnsw/hnsw_streamer_buffer_test.cc +++ b/tests/core/algorithm/hnsw/hnsw_streamer_buffer_test.cc @@ -171,6 +171,254 @@ TEST_F(HnswStreamerTest, TestHnswSearch) { cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl; } +TEST_F(HnswStreamerTest, TestHnswSearchBuffer) { + MemoryLimitPool::get_instance().init(2 * 1024UL * 1024UL * 1024UL); + IndexStreamer::Pointer write_streamer = + IndexFactory::CreateStreamer("HnswStreamer"); + ASSERT_TRUE(write_streamer != nullptr); + + Params params; + params.set(PARAM_HNSW_STREAMER_GET_VECTOR_ENABLE, true); + + ASSERT_EQ(0, write_streamer->init(*index_meta_ptr_, params)); + auto storage = IndexFactory::CreateStorage("BufferStorage"); + ASSERT_NE(nullptr, storage); + Params stg_params; + ASSERT_EQ(0, storage->init(stg_params)); + ASSERT_EQ(0, storage->open(dir_ + "Test/TestHnswSearchBuffer", true)); + ASSERT_EQ(0, write_streamer->open(storage)); + + auto ctx = write_streamer->create_context(); + ASSERT_TRUE(!!ctx); + + size_t cnt = 10000UL; + IndexQueryMeta qmeta(IndexMeta::DT_FP32, dim); + for (size_t i = 0; i < cnt; i++) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + write_streamer->add_impl(i, vec.data(), qmeta, ctx); + } + write_streamer->flush(0UL); + write_streamer->close(); + write_streamer.reset(); + storage->close(); + + IndexStreamer::Pointer read_streamer = + IndexFactory::CreateStreamer("HnswStreamer"); + ASSERT_EQ(0, read_streamer->init(*index_meta_ptr_, params)); + auto read_storage = IndexFactory::CreateStorage("BufferStorage"); + ASSERT_NE(nullptr, read_storage); + ASSERT_EQ(0, read_storage->init(stg_params)); + ASSERT_EQ(0, read_storage->open(dir_ + "Test/TestHnswSearchBuffer", false)); + ASSERT_EQ(0, read_streamer->open(read_storage)); + size_t topk = 3; + auto provider = read_streamer->create_provider(); + for (size_t i = 0; i < cnt; i += 1) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result1 = ctx->result(); + ASSERT_EQ(topk, result1.size()); + IndexStorage::MemoryBlock block; + ASSERT_EQ(0, provider->get_vector(result1[0].key(), block)); + const float *data = (float *)block.data(); + for (size_t j = 0; j < dim; ++j) { + ASSERT_EQ(data[j], i); + } + ASSERT_EQ(i, result1[0].key()); + + for (size_t j = 0; j < dim; ++j) { + vec[j] = i + 0.1f; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result2 = ctx->result(); + ASSERT_EQ(topk, result2.size()); + ASSERT_EQ(i, result2[0].key()); + ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); + ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key()); + } + + ctx->set_topk(100U); + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = 10.1f; + } + ASSERT_EQ(0, read_streamer->search_bf_impl(vec.data(), qmeta, ctx)); + auto &result = ctx->result(); + ASSERT_EQ(100U, result.size()); + ASSERT_EQ(10, result[0].key()); + ASSERT_EQ(11, result[1].key()); + ASSERT_EQ(5, result[10].key()); + ASSERT_EQ(0, result[20].key()); + ASSERT_EQ(30, result[30].key()); + ASSERT_EQ(35, result[35].key()); + ASSERT_EQ(99, result[99].key()); + + ElapsedTime elapsed_time; + for (size_t i = 0; i < cnt; i += 1) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result1 = ctx->result(); + ASSERT_EQ(topk, result1.size()); + IndexStorage::MemoryBlock block; + ASSERT_EQ(0, provider->get_vector(result1[0].key(), block)); + const float *data = (float *)block.data(); + for (size_t j = 0; j < dim; ++j) { + ASSERT_EQ(data[j], i); + } + ASSERT_EQ(i, result1[0].key()); + + for (size_t j = 0; j < dim; ++j) { + vec[j] = i + 0.1f; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result2 = ctx->result(); + ASSERT_EQ(topk, result2.size()); + ASSERT_EQ(i, result2[0].key()); + ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); + ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key()); + } + + read_streamer->close(); + read_streamer.reset(); + cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl; +} + +TEST_F(HnswStreamerTest, TestHnswSearchBufferMMap) { + MemoryLimitPool::get_instance().init(2 * 1024UL * 1024UL * 1024UL); + IndexStreamer::Pointer write_streamer = + IndexFactory::CreateStreamer("HnswStreamer"); + ASSERT_TRUE(write_streamer != nullptr); + + Params params; + params.set(PARAM_HNSW_STREAMER_GET_VECTOR_ENABLE, true); + + ASSERT_EQ(0, write_streamer->init(*index_meta_ptr_, params)); + auto storage = IndexFactory::CreateStorage("BufferStorage"); + ASSERT_NE(nullptr, storage); + Params stg_params; + ASSERT_EQ(0, storage->init(stg_params)); + ASSERT_EQ(0, storage->open(dir_ + "Test/TestHnswSearchBufferMMap", true)); + ASSERT_EQ(0, write_streamer->open(storage)); + + auto ctx = write_streamer->create_context(); + ASSERT_TRUE(!!ctx); + + size_t cnt = 10000UL; + IndexQueryMeta qmeta(IndexMeta::DT_FP32, dim); + for (size_t i = 0; i < cnt; i++) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + write_streamer->add_impl(i, vec.data(), qmeta, ctx); + } + write_streamer->flush(0UL); + write_streamer->close(); + write_streamer.reset(); + storage->close(); + + IndexStreamer::Pointer read_streamer = + IndexFactory::CreateStreamer("HnswStreamer"); + ASSERT_EQ(0, read_streamer->init(*index_meta_ptr_, params)); + auto read_storage = IndexFactory::CreateStorage("MMapFileStorage"); + ASSERT_NE(nullptr, read_storage); + ASSERT_EQ(0, read_storage->init(stg_params)); + ASSERT_EQ(0, read_storage->open(dir_ + "Test/TestHnswSearchBufferMMap", false)); + ASSERT_EQ(0, read_streamer->open(read_storage)); + size_t topk = 3; + auto provider = read_streamer->create_provider(); + for (size_t i = 0; i < cnt; i += 1) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result1 = ctx->result(); + ASSERT_EQ(topk, result1.size()); + IndexStorage::MemoryBlock block; + ASSERT_EQ(0, provider->get_vector(result1[0].key(), block)); + const float *data = (float *)block.data(); + for (size_t j = 0; j < dim; ++j) { + ASSERT_EQ(data[j], i); + } + ASSERT_EQ(i, result1[0].key()); + + for (size_t j = 0; j < dim; ++j) { + vec[j] = i + 0.1f; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result2 = ctx->result(); + ASSERT_EQ(topk, result2.size()); + ASSERT_EQ(i, result2[0].key()); + ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); + ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key()); + } + + ctx->set_topk(100U); + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = 10.1f; + } + ASSERT_EQ(0, read_streamer->search_bf_impl(vec.data(), qmeta, ctx)); + auto &result = ctx->result(); + ASSERT_EQ(100U, result.size()); + ASSERT_EQ(10, result[0].key()); + ASSERT_EQ(11, result[1].key()); + ASSERT_EQ(5, result[10].key()); + ASSERT_EQ(0, result[20].key()); + ASSERT_EQ(30, result[30].key()); + ASSERT_EQ(35, result[35].key()); + ASSERT_EQ(99, result[99].key()); + + ElapsedTime elapsed_time; + for (size_t i = 0; i < cnt; i += 1) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result1 = ctx->result(); + ASSERT_EQ(topk, result1.size()); + IndexStorage::MemoryBlock block; + ASSERT_EQ(0, provider->get_vector(result1[0].key(), block)); + const float *data = (float *)block.data(); + for (size_t j = 0; j < dim; ++j) { + ASSERT_EQ(data[j], i); + } + ASSERT_EQ(i, result1[0].key()); + + for (size_t j = 0; j < dim; ++j) { + vec[j] = i + 0.1f; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result2 = ctx->result(); + ASSERT_EQ(topk, result2.size()); + ASSERT_EQ(i, result2[0].key()); + ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); + ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key()); + } + + read_streamer->close(); + read_streamer.reset(); + cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl; +} + TEST_F(HnswStreamerTest, TestHnswSearchMMap) { IndexStreamer::Pointer write_streamer = IndexFactory::CreateStreamer("HnswStreamer"); From 6ecb2b5f337e2dad32fabbf5ca19f72ea6b22cdd Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Fri, 15 May 2026 17:28:55 +0800 Subject: [PATCH 03/47] fix --- src/core/utility/buffer_storage.cc | 162 +++++++++++------- .../zvec/core/framework/index_storage.h | 70 +++++++- 2 files changed, 167 insertions(+), 65 deletions(-) diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index b6cd67d75..a260a77ae 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -41,32 +41,37 @@ class BufferStorage : public IndexStorage { typedef std::shared_ptr Pointer; //! Constructor - WrappedSegment(BufferStorage *owner, IndexMapping::Segment *segment, - uint64_t segment_header_start_offset, - IndexFormat::MetaHeader *segment_header, size_t segment_id) - : segment_(segment), + //! + //! `info` MUST be a pointer into BufferStorage::segments_ (an + //! unordered_map mapped value). C++ guarantees those pointers stay + //! valid across insertions, so the WrappedSegment can safely fetch + //! the LATEST segment_header / segment_header_start_offset / Segment + //! after a re-parse caused by append_segment(). Storing the pointer + //! (rather than copying header_/offset into local fields) is what + //! prevents use-after-free when chain_headers_ is rebuilt. + WrappedSegment(BufferStorage *owner, IndexMapping::SegmentInfo *info, + size_t segment_id) + : segment_info_(info), owner_(owner), segment_id_(segment_id), - capacity_(static_cast(segment->meta()->data_size + - segment->meta()->padding_size)), - segment_header_start_offset_(segment_header_start_offset), - segment_header_(segment_header) {} + capacity_(static_cast(info->segment.meta()->data_size + + info->segment.meta()->padding_size)) {} //! Destructor virtual ~WrappedSegment(void) {} //! Retrieve size of data size_t data_size(void) const override { - return static_cast(segment_->meta()->data_size); + return static_cast(segment_info_->segment.meta()->data_size); } //! Retrieve crc of data uint32_t data_crc(void) const override { - return segment_->meta()->data_crc; + return segment_info_->segment.meta()->data_crc; } //! Retrieve size of padding size_t padding_size(void) const override { - return static_cast(segment_->meta()->padding_size); + return static_cast(segment_info_->segment.meta()->padding_size); } //! Retrieve capacity of segment @@ -85,16 +90,17 @@ class BufferStorage : public IndexStorage { owner_->file_name_.c_str(), segment_id_); return 0; } - if (ailego_unlikely(offset + len > segment_->meta()->data_size)) { - auto meta = segment_->meta(); + if (ailego_unlikely(offset + len > + segment_info_->segment.meta()->data_size)) { + auto meta = segment_info_->segment.meta(); if (offset > meta->data_size) { offset = meta->data_size; } len = meta->data_size - offset; } - size_t abs_offset = segment_header_start_offset_ + - segment_header_->content_offset + - segment_->meta()->data_index + offset; + size_t abs_offset = segment_info_->segment_header_start_offset + + segment_info_->segment_header->content_offset + + segment_info_->segment.meta()->data_index + offset; if (!owner_->buffer_pool_handle_->read_range(abs_offset, len, static_cast(buf))) { return 0; @@ -112,16 +118,17 @@ class BufferStorage : public IndexStorage { *data = nullptr; return 0; } - if (ailego_unlikely(offset + len > segment_->meta()->data_size)) { - auto meta = segment_->meta(); + if (ailego_unlikely(offset + len > + segment_info_->segment.meta()->data_size)) { + auto meta = segment_info_->segment.meta(); if (offset > meta->data_size) { offset = meta->data_size; } len = meta->data_size - offset; } - size_t abs_offset = segment_header_start_offset_ + - segment_header_->content_offset + - segment_->meta()->data_index + offset; + size_t abs_offset = segment_info_->segment_header_start_offset + + segment_info_->segment_header->content_offset + + segment_info_->segment.meta()->data_index + offset; size_t first_page = abs_offset / ailego::kVectorPageSize; size_t last_page = (len == 0) ? first_page @@ -168,16 +175,17 @@ class BufferStorage : public IndexStorage { owner_->file_name_.c_str(), segment_id_); return 0; } - if (ailego_unlikely(offset + len > segment_->meta()->data_size)) { - auto meta = segment_->meta(); + if (ailego_unlikely(offset + len > + segment_info_->segment.meta()->data_size)) { + auto meta = segment_info_->segment.meta(); if (offset > meta->data_size) { offset = meta->data_size; } len = meta->data_size - offset; } - size_t abs_offset = segment_header_start_offset_ + - segment_header_->content_offset + - segment_->meta()->data_index + offset; + size_t abs_offset = segment_info_->segment_header_start_offset + + segment_info_->segment_header->content_offset + + segment_info_->segment.meta()->data_index + offset; size_t first_page = abs_offset / ailego::kVectorPageSize; size_t last_page = (len == 0) ? first_page @@ -203,7 +211,7 @@ class BufferStorage : public IndexStorage { LOG_ERROR("read error (cross-page read_range failed)."); return -1; } - data = MemoryBlock::MakeOwned(tmp); + data = MemoryBlock::MakeOwned(tmp, len); return len; } @@ -227,27 +235,37 @@ class BufferStorage : public IndexStorage { offset, len, capacity_); return 0; } - auto meta = segment_->meta(); + auto meta = segment_info_->segment.meta(); if (offset + len > meta->data_size) { meta->data_size = offset + len; meta->padding_size = capacity_ - meta->data_size; - owner_->set_as_dirty(); } - size_t abs_offset = segment_header_start_offset_ + - segment_header_->content_offset + - segment_->meta()->data_index + offset; + size_t abs_offset = segment_info_->segment_header_start_offset + + segment_info_->segment_header->content_offset + + segment_info_->segment.meta()->data_index + offset; if (owner_->buffer_pool_handle_->write_range( abs_offset, len, static_cast(data)) != 0) { LOG_ERROR("write() page-cache write_range failed at abs_offset=%zu", abs_offset); return 0; } + // ALWAYS mark dirty after a successful page-cache write so that the + // next flush_index() does NOT take the `if (!index_dirty_) return 0;` + // short-circuit and skip flush_all(). Previously this was only set + // when `data_size` grew, which meant fixed-size segments (e.g. + // chunk_meta_segment writing HnswChunkMeta in place) never raised + // the dirty flag -- their 4K page-cache pages were not flushed before + // append_segment() / reopen_pool(), so the freshly-rebuilt page table + // pread'd stale content from disk and chunk_cnts[NODE] lagged the + // real segment count, eventually causing sync_chunks() to see a + // mid-state segment and crash with a NULL Chunk::Pointer. + owner_->set_as_dirty(); return len; } //! Resize size of data size_t resize(size_t size) override { - auto meta = segment_->meta(); + auto meta = segment_info_->segment.meta(); if (meta->data_size != size) { if (size > capacity_) { size = capacity_; @@ -261,7 +279,7 @@ class BufferStorage : public IndexStorage { //! Update crc of data void update_data_crc(uint32_t crc) override { - segment_->meta()->data_crc = crc; + segment_info_->segment.meta()->data_crc = crc; owner_->set_as_dirty(); } @@ -272,14 +290,17 @@ class BufferStorage : public IndexStorage { protected: friend BufferStorage; - IndexMapping::Segment *segment_{}; + // Pointer into BufferStorage::segments_ (an unordered_map mapped value). + // C++ guarantees the address stays valid across map insertions. All + // header / start-offset / segment-meta accesses go through this pointer + // so that re-parses (append_segment -> reopen_pool) are observed without + // needing to recreate WrappedSegment instances held by callers. + IndexMapping::SegmentInfo *segment_info_{nullptr}; private: BufferStorage *owner_{nullptr}; size_t segment_id_{}; size_t capacity_{}; - uint64_t segment_header_start_offset_; - IndexFormat::MetaHeader *segment_header_; }; //! Destructor @@ -374,24 +395,23 @@ class BufferStorage : public IndexStorage { return tmp; } - int ParseHeader(size_t offset) { - std::unique_ptr buffer(new char[sizeof(header_)]); + int ParseHeader(size_t offset, IndexFormat::MetaHeader *out) { + std::unique_ptr buffer(new char[sizeof(*out)]); // NOTE: bypass a wrapper get_meta() -- ParseHeader is called from // reopen_pool() which already holds a unique_lock on mapping_mutex_ // (std::shared_mutex is not reentrant -> deadlock). - if (buffer_pool_handle_->get_meta(offset, sizeof(header_), buffer.get()) != + if (buffer_pool_handle_->get_meta(offset, sizeof(*out), buffer.get()) != 0) { LOG_ERROR("Get segment header failed."); return IndexError_Runtime; } - uint8_t *header_ptr = reinterpret_cast(buffer.get()); - memcpy(&header_, header_ptr, sizeof(header_)); - if (header_.meta_header_size != sizeof(IndexFormat::MetaHeader)) { + memcpy(out, buffer.get(), sizeof(*out)); + if (out->meta_header_size != sizeof(IndexFormat::MetaHeader)) { LOG_ERROR("Header meta size is invalid."); return IndexError_InvalidLength; } - if (ailego::Crc32c::Hash(&header_, sizeof(header_), header_.header_crc) != - header_.header_crc) { + if (ailego::Crc32c::Hash(out, sizeof(*out), out->header_crc) != + out->header_crc) { LOG_ERROR("Header meta checksum is invalid."); return IndexError_InvalidChecksum; } @@ -420,7 +440,7 @@ class BufferStorage : public IndexStorage { return 0; } - int ParseSegment(size_t offset) { + int ParseSegment(size_t offset, IndexFormat::MetaHeader *chain_header) { // NOTE: this function is only called from ParseToMapping(), which is // itself called from either open() (single-threaded construction) or // reopen_pool() (always invoked under the unique_lock held by @@ -470,8 +490,16 @@ class BufferStorage : public IndexStorage { // instances that already hold a pointer to this entry (via // &segments_[name].segment) continue to use the refreshed meta_ptr_ // after the re-parse. + // + // IMPORTANT: chain_header points into chain_headers_ which is a + // std::vector>; each chain owns its OWN + // MetaHeader copy. Do NOT use a shared &header_ here -- when there + // are multiple meta-header chains in the file, the next ParseHeader() + // would overwrite that single instance and break content_offset for + // all earlier-chain segments. segments_[seg_name] = IndexMapping::SegmentInfo{ - IndexMapping::Segment{iter}, current_header_start_offset_, &header_}; + IndexMapping::Segment{iter}, current_header_start_offset_, + chain_header}; max_segment_size_ = std::max(max_segment_size_, iter->data_size + iter->padding_size); if (sizeof(IndexFormat::SegmentMeta) * footer_.segment_count > @@ -486,30 +514,37 @@ class BufferStorage : public IndexStorage { int ParseToMapping() { while (true) { int ret; - ret = ParseHeader(current_header_start_offset_); + // Allocate an OWN MetaHeader for this chain so that subsequent chains + // never overwrite earlier-chain headers (prior implementation used a + // single header_ member, which corrupted content_offset for chain-0 + // segments once chain-1 was parsed). + chain_headers_.emplace_back( + std::make_unique()); + IndexFormat::MetaHeader *chain_header = chain_headers_.back().get(); + ret = ParseHeader(current_header_start_offset_, chain_header); if (ret != 0) { LOG_ERROR("Failed to parse header, errno %d, %s", ret, IndexError::What(ret)); return ret; } - switch (header_.version) { + switch (chain_header->version) { case IndexFormat::FORMAT_VERSION: break; default: - LOG_ERROR("Unsupported index version: %u", header_.version); + LOG_ERROR("Unsupported index version: %u", chain_header->version); return IndexError_Unsupported; } // Unpack footer - if (header_.meta_footer_size != sizeof(IndexFormat::MetaFooter)) { + if (chain_header->meta_footer_size != sizeof(IndexFormat::MetaFooter)) { return IndexError_InvalidLength; } - if ((int32_t)header_.meta_footer_offset < 0) { + if ((int32_t)chain_header->meta_footer_offset < 0) { return IndexError_Unsupported; } uint64_t footer_offset = - header_.meta_footer_offset + current_header_start_offset_; + chain_header->meta_footer_offset + current_header_start_offset_; ret = ParseFooter(footer_offset); if (ret != 0) { LOG_ERROR("Failed to parse footer, errno %d, %s", ret, @@ -524,7 +559,7 @@ class BufferStorage : public IndexStorage { } const uint64_t segment_start_offset = footer_offset - footer_.segments_meta_size; - ret = ParseSegment(segment_start_offset); + ret = ParseSegment(segment_start_offset, chain_header); if (ret != 0) { LOG_ERROR("Failed to parse segment, errno %d, %s", ret, IndexError::What(ret)); @@ -577,9 +612,7 @@ class BufferStorage : public IndexStorage { if (!segment_info) { return WrappedSegment::Pointer{}; } - return std::make_shared( - this, &segment_info->segment, segment_info->segment_header_start_offset, - segment_info->segment_header, id_hash_[id]); + return std::make_shared(this, segment_info, id_hash_[id]); } //! Test if it a segment exists @@ -589,7 +622,10 @@ class BufferStorage : public IndexStorage { //! Retrieve magic number of index uint32_t magic(void) const override { - return header_.magic; + if (chain_headers_.empty()) { + return 0u; + } + return chain_headers_.front()->magic; } protected: @@ -741,7 +777,7 @@ class BufferStorage : public IndexStorage { file_name_.clear(); id_hash_.clear(); segments_.clear(); - memset(&header_, 0, sizeof(header_)); + chain_headers_.clear(); memset(&footer_, 0, sizeof(footer_)); { std::lock_guard tmp_latch(tmp_buffers_mutex_); @@ -864,9 +900,9 @@ class BufferStorage : public IndexStorage { id_hash_.clear(); buffer_pool_buffers_.clear(); meta_chains_.clear(); + chain_headers_.clear(); current_header_start_offset_ = 0u; max_segment_size_ = 0u; - memset(&header_, 0, sizeof(header_)); memset(&footer_, 0, sizeof(footer_)); // Delegate the structural append to IndexMapping (same engine used by @@ -933,7 +969,11 @@ class BufferStorage : public IndexStorage { // buffer manager std::string file_name_; - IndexFormat::MetaHeader header_{}; + // Per-chain owning copies of MetaHeader. segments_[name].segment_header + // points into one of these, so each chain's content_offset stays stable + // across re-parses (a single shared header_ would be overwritten by the + // next chain's ParseHeader and corrupt earlier-chain segment reads). + std::vector> chain_headers_{}; IndexFormat::MetaFooter footer_{}; std::unordered_map segments_{}; std::unordered_map id_hash_{}; diff --git a/src/include/zvec/core/framework/index_storage.h b/src/include/zvec/core/framework/index_storage.h index 530073aad..1fae20eb9 100644 --- a/src/include/zvec/core/framework/index_storage.h +++ b/src/include/zvec/core/framework/index_storage.h @@ -14,6 +14,8 @@ #pragma once +#include + #include #include #include @@ -47,23 +49,35 @@ class IndexStorage : public IndexModule { } MemoryBlock(void *data) : type_(MemoryBlockType::MBT_MMAP), data_(data) {} - static MemoryBlock MakeOwned(void *owned) { + //! Build an HEAP_SCRATCH MemoryBlock that owns `owned` (allocated via + //! ailego_malloc / ailego_aligned_malloc). `size` is the byte length of + //! the buffer and is required so that copy construction / copy + //! assignment can deep-copy the buffer instead of aliasing it (a shallow + //! copy would result in use-after-free once the original block is + //! destructed and frees the buffer). + static MemoryBlock MakeOwned(void *owned, size_t size) { MemoryBlock mb; mb.type_ = MemoryBlockType::MBT_HEAP_SCRATCH; mb.data_ = owned; + mb.scratch_size_ = size; return mb; } MemoryBlock(const MemoryBlock &rhs) { switch (rhs.type_) { case MemoryBlockType::MBT_MMAP: - case MemoryBlockType::MBT_HEAP_SCRATCH: this->reset(rhs.data_); break; case MemoryBlockType::MBT_BUFFERPOOL: this->reset(rhs.buffer_pool_handle_, rhs.buffer_block_id_, rhs.data_); buffer_pool_handle_->acquire_one(buffer_block_id_); break; + case MemoryBlockType::MBT_HEAP_SCRATCH: + // Deep copy: each owner must hold its own buffer, otherwise the + // first destructor frees the buffer and leaves the surviving + // copies dangling. + deep_copy_from(rhs); + break; default: break; } @@ -83,7 +97,9 @@ class IndexStorage : public IndexModule { case MemoryBlockType::MBT_HEAP_SCRATCH: type_ = MemoryBlockType::MBT_HEAP_SCRATCH; data_ = rhs.data_; + scratch_size_ = rhs.scratch_size_; rhs.data_ = nullptr; + rhs.scratch_size_ = 0; rhs.type_ = MemoryBlockType::MBT_UNKNOWN; break; default: @@ -103,7 +119,8 @@ class IndexStorage : public IndexModule { buffer_pool_handle_->acquire_one(buffer_block_id_); break; case MemoryBlockType::MBT_HEAP_SCRATCH: - this->reset(rhs.data_); + release_current(); + deep_copy_from(rhs); break; default: break; @@ -125,10 +142,12 @@ class IndexStorage : public IndexModule { rhs.type_ = MemoryBlockType::MBT_UNKNOWN; break; case MemoryBlockType::MBT_HEAP_SCRATCH: - release_owned(); + release_current(); type_ = MemoryBlockType::MBT_HEAP_SCRATCH; data_ = rhs.data_; + scratch_size_ = rhs.scratch_size_; rhs.data_ = nullptr; + rhs.scratch_size_ = 0; rhs.type_ = MemoryBlockType::MBT_UNKNOWN; break; default: @@ -154,6 +173,7 @@ class IndexStorage : public IndexModule { break; } data_ = nullptr; + scratch_size_ = 0; } const void *data() const { @@ -188,6 +208,10 @@ class IndexStorage : public IndexModule { void *data_{nullptr}; mutable ailego::VecBufferPoolHandle *buffer_pool_handle_{nullptr}; size_t buffer_block_id_{0}; + //! Byte size of the heap-scratch buffer pointed to by `data_`; only used + //! when type_ == MBT_HEAP_SCRATCH. Required for safe deep-copy on + //! copy-construction / copy-assignment of HEAP_SCRATCH blocks. + size_t scratch_size_{0}; private: void release_owned() { @@ -195,6 +219,44 @@ class IndexStorage : public IndexModule { ailego_free(data_); data_ = nullptr; } + scratch_size_ = 0; + } + + //! Drop whatever the current MemoryBlock holds, regardless of type, so + //! that the slot is ready to receive new ownership. Mirrors what the + //! destructor would do (minus zeroing data_) but leaves the type alone + //! for the caller to overwrite immediately afterwards. + void release_current() { + switch (type_) { + case MemoryBlockType::MBT_BUFFERPOOL: + if (buffer_pool_handle_) { + buffer_pool_handle_->release_one(buffer_block_id_); + buffer_pool_handle_ = nullptr; + } + break; + case MemoryBlockType::MBT_HEAP_SCRATCH: + release_owned(); + break; + default: + break; + } + data_ = nullptr; + type_ = MemoryBlockType::MBT_UNKNOWN; + } + + //! Allocate a fresh buffer of the same size as `rhs.scratch_size_`, + //! memcpy `rhs.data_` into it, and become the new owner. Used by the + //! HEAP_SCRATCH copy ctor / copy assignment so the original and the + //! copy each free their own buffer independently. + void deep_copy_from(const MemoryBlock &rhs) { + type_ = MemoryBlockType::MBT_HEAP_SCRATCH; + scratch_size_ = rhs.scratch_size_; + if (scratch_size_ > 0 && rhs.data_) { + data_ = ailego_malloc(scratch_size_); + std::memcpy(data_, rhs.data_, scratch_size_); + } else { + data_ = nullptr; + } } }; From be1d0f49ce50f34dc6ed64e89eb68027dbd0f992 Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Fri, 15 May 2026 18:26:06 +0800 Subject: [PATCH 04/47] fix --- src/ailego/buffer/vector_page_table.cc | 26 ++ src/core/utility/buffer_storage.cc | 286 +++++++++++++----- .../zvec/ailego/buffer/vector_page_table.h | 9 + 3 files changed, 252 insertions(+), 69 deletions(-) diff --git a/src/ailego/buffer/vector_page_table.cc b/src/ailego/buffer/vector_page_table.cc index 43a434225..cb6ec3186 100644 --- a/src/ailego/buffer/vector_page_table.cc +++ b/src/ailego/buffer/vector_page_table.cc @@ -381,6 +381,32 @@ int VecBufferPool::flush_all() { return rc; } +bool VecBufferPool::extend_file(size_t new_size) { + if (!writable_) { + LOG_ERROR("extend_file called on read-only pool: file[%s]", + file_name_.c_str()); + return false; + } + if (new_size <= file_size_) { + return true; + } +#if defined(_MSC_VER) + if (_chsize_s(fd_, static_cast(new_size)) != 0) { + LOG_ERROR("extend_file _chsize_s failed: file[%s], new_size[%zu]", + file_name_.c_str(), new_size); + return false; + } +#else + if (::ftruncate(fd_, static_cast(new_size)) != 0) { + LOG_ERROR("extend_file ftruncate failed: file[%s], new_size[%zu]", + file_name_.c_str(), new_size); + return false; + } +#endif + file_size_ = new_size; + return true; +} + char *VecBufferPoolHandle::get_single_page(size_t file_offset, size_t len, size_t &out_page_id) { size_t first_page = file_offset / kVectorPageSize; diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index a260a77ae..4383caeb9 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -440,7 +440,8 @@ class BufferStorage : public IndexStorage { return 0; } - int ParseSegment(size_t offset, IndexFormat::MetaHeader *chain_header) { + int ParseSegment(size_t offset, IndexFormat::MetaHeader *chain_header, + uint32_t *out_segment_ids_offset) { // NOTE: this function is only called from ParseToMapping(), which is // itself called from either open() (single-threaded construction) or // reopen_pool() (always invoked under the unique_lock held by @@ -508,6 +509,9 @@ class BufferStorage : public IndexStorage { } } buffer_pool_buffers_.push_back(std::move(segment_buffer)); + if (out_segment_ids_offset) { + *out_segment_ids_offset = segment_ids_offset; + } return 0; } @@ -559,7 +563,9 @@ class BufferStorage : public IndexStorage { } const uint64_t segment_start_offset = footer_offset - footer_.segments_meta_size; - ret = ParseSegment(segment_start_offset, chain_header); + uint32_t segment_ids_offset = footer_.segments_meta_size; + ret = ParseSegment(segment_start_offset, chain_header, + &segment_ids_offset); if (ret != 0) { LOG_ERROR("Failed to parse segment, errno %d, %s", ret, IndexError::What(ret)); @@ -570,7 +576,8 @@ class BufferStorage : public IndexStorage { // updated segment metas and footers back to the backing file. meta_chains_.push_back({current_header_start_offset_, footer_offset, segment_start_offset, - footer_.segments_meta_size}); + footer_.segments_meta_size, + segment_ids_offset}); if (footer_.next_meta_header_offset == 0) { break; @@ -833,28 +840,49 @@ class BufferStorage : public IndexStorage { return buffer_pool_->init(); } - //! Append a segment into storage + //! Append a segment into storage. + //! + //! Stage 1 implementation: bypass IndexMapping entirely. We compute the + //! new chain layout in memory, persist only the touched bytes via + //! `write_meta` (a few pwrites), and rotate to a fresh VecBufferPool so + //! its page_table_ covers the extended file. ParseToMapping() is NOT + //! re-run because the in-memory state (segments_/chain_headers_/ + //! buffer_pool_buffers_/footer_/meta_chains_) is already authoritative. int append_segment(const std::string &id, size_t size) { // Flush any in-memory metadata changes (data_size, padding_size, CRC) - // accumulated by prior write()/resize() calls BEFORE we reset the buffer - // pool below. Without this flush, those changes would be lost when - // buffer_pool_buffers_ is cleared and re-populated from disk. - // IMPORTANT: call flush_index() BEFORE taking the unique_lock below; - // flush_index() internally takes a shared_lock on the same mutex and - // std::shared_mutex is NOT reentrant. + // accumulated by prior write()/resize() calls BEFORE we take the + // unique_lock. flush_index() takes a shared_lock on the same mutex + // and std::shared_mutex is NOT reentrant. this->flush_index(); - // UNIQUE LOCK: hold the mutex for the entire structural modification - // (reset -> IndexMapping.open/append/flush -> reopen_pool). Concurrent - // readers/writers taking shared_lock will block here. std::unique_lock latch(mapping_mutex_); - // RETIRE the old pool instead of immediately destroying it. MemoryBlock - // objects held by other threads carry a ref_count on a block inside this - // pool but store only a RAW VecBufferPoolHandle*; if we reset() the - // shared_ptr here, the pool destructor fires while those ref_counts are - // still > 0 and the is_released() assert trips. By parking in - // retired_pools_ the pool survives until all external refs are gone. + if (!buffer_pool_ || !buffer_pool_handle_) { + LOG_ERROR("append_segment: pool not ready, file[%s]", + file_name_.c_str()); + return IndexError_Runtime; + } + if (!buffer_pool_->writable()) { + LOG_ERROR("append_segment: pool is read-only, file[%s]", + file_name_.c_str()); + return IndexError_Runtime; + } + if (size == 0) { + return IndexError_InvalidArgument; + } + if (segments_.find(id) != segments_.end()) { + return IndexError_Duplicate; + } + if (meta_chains_.empty() || chain_headers_.empty() || + buffer_pool_buffers_.empty()) { + LOG_ERROR("append_segment: invalid state, file[%s]", + file_name_.c_str()); + return IndexError_Runtime; + } + + // Retire stale pools whose blocks are no longer referenced. Reused + // from the prior implementation so MemoryBlock instances held by other + // threads keep their raw VecBufferPoolHandle* alive. auto prune_retired = [&]() { size_t w = 0; for (size_t r = 0; r < retired_pools_.size(); ++r) { @@ -879,12 +907,161 @@ class BufferStorage : public IndexStorage { }; prune_retired(); - // Flush and release the buffer pool so IndexMapping can safely open - // and structurally modify the same file. + // Page-aligned padded size for the new segment. Matches IndexMapping's + // CalcPageAlignedSize() so the on-disk layout stays identical. + const size_t page_size = ailego::kVectorPageSize; + const size_t padded_size = (size + page_size - 1) / page_size * page_size; + + // The "current last chain" is meta_chains_.back() / chain_headers_.back(); + // footer_ is always the last chain's footer (overwritten by ParseFooter + // during ParseToMapping). + size_t id_size = id.length() + 1; + size_t need_size = sizeof(IndexFormat::SegmentMeta) + id_size; + MetaChain *chain = &meta_chains_.back(); + IndexFormat::MetaHeader *header = chain_headers_.back().get(); + char *meta_buf = buffer_pool_buffers_.back().get(); + + // ---- Step 1: chain split if current chain has no meta capacity left. + if (sizeof(IndexFormat::SegmentMeta) * footer_.segment_count + need_size > + chain->segment_ids_offset) { + size_t new_chain_start = buffer_pool_->file_size(); + new_chain_start = + (new_chain_start + page_size - 1) / page_size * page_size; + size_t new_meta_total = + (segment_meta_capacity_ + sizeof(IndexFormat::MetaHeader) + + sizeof(IndexFormat::MetaFooter) + page_size - 1) / + page_size * page_size; + uint32_t new_segments_meta_size = static_cast( + new_meta_total - sizeof(IndexFormat::MetaHeader) - + sizeof(IndexFormat::MetaFooter)); + + // Update OLD footer in memory + on disk so it links to the new chain. + footer_.next_meta_header_offset = new_chain_start; + IndexFormat::UpdateMetaFooter(&footer_, 0); + if (buffer_pool_handle_->write_meta( + chain->footer_file_offset, sizeof(footer_), + reinterpret_cast(&footer_)) != 0) { + LOG_ERROR("append_segment: write old footer failed, file[%s]", + file_name_.c_str()); + return IndexError_WriteData; + } + + // Extend the file and write the new chain's header + (zero) footer. + // The segment_meta region is implicitly zero-filled by ftruncate, + // matching the empty `new_meta_buf` we keep in memory. + if (!buffer_pool_->extend_file(new_chain_start + new_meta_total)) { + return IndexError_Runtime; + } + + auto new_header = std::make_unique(); + IndexFormat::SetupMetaHeader( + new_header.get(), + static_cast(new_meta_total - + sizeof(IndexFormat::MetaFooter)), + static_cast(new_meta_total)); + + auto new_meta_buf = std::make_unique(new_segments_meta_size); + std::memset(new_meta_buf.get(), 0, new_segments_meta_size); + + IndexFormat::MetaFooter new_footer; + IndexFormat::SetupMetaFooter(&new_footer); + new_footer.segments_meta_size = new_segments_meta_size; + new_footer.total_size = new_meta_total; + new_footer.segments_meta_crc = ailego::Crc32c::Hash( + new_meta_buf.get(), new_segments_meta_size, 0u); + IndexFormat::UpdateMetaFooter(&new_footer, 0); + + if (buffer_pool_handle_->write_meta( + new_chain_start, sizeof(IndexFormat::MetaHeader), + reinterpret_cast(new_header.get())) != 0) { + return IndexError_WriteData; + } + uint64_t new_segment_meta_file_offset = + new_chain_start + sizeof(IndexFormat::MetaHeader); + uint64_t new_footer_file_offset = + new_chain_start + new_header->meta_footer_offset; + if (buffer_pool_handle_->write_meta( + new_footer_file_offset, sizeof(new_footer), + reinterpret_cast(&new_footer)) != 0) { + return IndexError_WriteData; + } + + // Mirror to in-memory state. + chain_headers_.push_back(std::move(new_header)); + buffer_pool_buffers_.push_back(std::move(new_meta_buf)); + meta_chains_.push_back(MetaChain{new_chain_start, new_footer_file_offset, + new_segment_meta_file_offset, + new_segments_meta_size, + new_segments_meta_size}); + footer_ = new_footer; + current_header_start_offset_ = new_chain_start; + + chain = &meta_chains_.back(); + header = chain_headers_.back().get(); + meta_buf = buffer_pool_buffers_.back().get(); + } + + // ---- Step 2: append SegmentMeta + ID into the (possibly new) last + // chain, then persist meta_buf and footer. + uint64_t new_data_index = footer_.content_size; + uint64_t new_seg_abs_offset = + chain->header_start_offset + header->content_offset + new_data_index; + uint64_t new_file_size = new_seg_abs_offset + padded_size; + if (new_file_size > buffer_pool_->file_size()) { + if (!buffer_pool_->extend_file(new_file_size)) { + return IndexError_Runtime; + } + } + + chain->segment_ids_offset -= static_cast(id_size); + IndexFormat::SegmentMeta *new_seg = + reinterpret_cast(meta_buf) + + footer_.segment_count; + new_seg->segment_id_offset = chain->segment_ids_offset; + new_seg->data_index = new_data_index; + new_seg->data_size = 0; + new_seg->data_crc = 0; + new_seg->padding_size = padded_size; + std::memcpy(meta_buf + chain->segment_ids_offset, id.c_str(), id_size); + + footer_.segment_count += 1; + footer_.content_size += padded_size; + footer_.total_size += padded_size; + footer_.segments_meta_crc = + ailego::Crc32c::Hash(meta_buf, chain->segment_meta_size, 0u); + IndexFormat::UpdateMetaFooter(&footer_, 0); + + if (buffer_pool_handle_->write_meta(chain->segment_meta_file_offset, + chain->segment_meta_size, + meta_buf) != 0) { + LOG_ERROR("append_segment: write segment_meta failed, file[%s]", + file_name_.c_str()); + return IndexError_WriteData; + } + if (buffer_pool_handle_->write_meta( + chain->footer_file_offset, sizeof(footer_), + reinterpret_cast(&footer_)) != 0) { + LOG_ERROR("append_segment: write footer failed, file[%s]", + file_name_.c_str()); + return IndexError_WriteData; + } + + // Mirror to in-memory mapping. WrappedSegment instances already held + // by callers reference &segments_[name], whose address is stable across + // unordered_map insertions, so existing references stay valid. + segments_[id] = IndexMapping::SegmentInfo{ + IndexMapping::Segment{new_seg}, chain->header_start_offset, header}; + id_hash_[id] = id_hash_.size(); + max_segment_size_ = std::max(max_segment_size_, padded_size); + + // ---- Step 3: rotate the buffer pool so its page_table_ covers the + // freshly extended file. The OLD pool is parked in + // retired_pools_ to keep MemoryBlock ref counts safe; we + // do NOT re-run ParseToMapping() because the in-memory + // state is already authoritative. if (buffer_pool_handle_) { buffer_pool_handle_->flush_all(); } - // Park the old pool + handle. if (buffer_pool_) { retired_pools_.push_back(std::move(buffer_pool_)); retired_handles_.push_back(std::move(buffer_pool_handle_)); @@ -892,56 +1069,21 @@ class BufferStorage : public IndexStorage { buffer_pool_handle_.reset(); } buffer_pool_.reset(); - // Reset parse-time state EXCEPT for segments_: WrappedSegment instances - // held by callers store raw pointers into segments_' mapped values. - // The C++ standard guarantees that unordered_map references/pointers to - // mapped values are never invalidated by insertions, so we can safely - // leave segments_ intact and update entries in-place during re-parse. - id_hash_.clear(); - buffer_pool_buffers_.clear(); - meta_chains_.clear(); - chain_headers_.clear(); - current_header_start_offset_ = 0u; - max_segment_size_ = 0u; - memset(&footer_, 0, sizeof(footer_)); - // Delegate the structural append to IndexMapping (same engine used by - // MMapFileStorage) so the on-disk format stays consistent. - IndexMapping mapping; - int ret = mapping.open(file_name_, /*cow=*/false, /*full_mode=*/false); - if (ret != 0) { - LOG_ERROR( - "BufferStorage::append_segment failed to open IndexMapping: " - "file[%s], id[%s], errno[%d]", - file_name_.c_str(), id.c_str(), ret); - reopen_pool(); - return ret; - } - ret = mapping.append(id, size); - if (ret != 0) { - LOG_ERROR( - "BufferStorage::append_segment failed to append segment: " - "file[%s], id[%s], errno[%d]", - file_name_.c_str(), id.c_str(), ret); - mapping.close(); - reopen_pool(); - return ret; - } - mapping.refresh(0); - ret = mapping.flush(); - mapping.close(); - if (ret != 0) { + try { + buffer_pool_ = std::make_shared( + file_name_, /*writable=*/true, /*create=*/false); + buffer_pool_handle_ = std::make_shared( + buffer_pool_->get_handle()); + } catch (const std::exception &e) { LOG_ERROR( - "BufferStorage::append_segment failed to flush: " - "file[%s], id[%s], errno[%d]", - file_name_.c_str(), id.c_str(), ret); - reopen_pool(); - return ret; + "append_segment: failed to reopen pool: file[%s], what[%s]", + file_name_.c_str(), e.what()); + buffer_pool_.reset(); + buffer_pool_handle_.reset(); + return IndexError_Runtime; } - - // Reopen the buffer pool and reload the mapping so the new segment is - // accessible via get_segment_info() / get(). - return reopen_pool(); + return buffer_pool_->init(); } //! Test if a segment exists @@ -1001,6 +1143,12 @@ class BufferStorage : public IndexStorage { uint64_t footer_file_offset; uint64_t segment_meta_file_offset; uint32_t segment_meta_size; + // Lowest offset of segment ID strings within the segment_meta region. + // Equals segment_meta_size when no IDs have been written yet, and + // decreases by `strlen(id)+1` for each appended segment. Used by + // append_segment() to detect when the chain runs out of meta capacity + // and a new chain must be split off. + uint32_t segment_ids_offset; }; std::vector meta_chains_{}; }; diff --git a/src/include/zvec/ailego/buffer/vector_page_table.h b/src/include/zvec/ailego/buffer/vector_page_table.h index 7fb0a9946..588fff87c 100644 --- a/src/include/zvec/ailego/buffer/vector_page_table.h +++ b/src/include/zvec/ailego/buffer/vector_page_table.h @@ -185,6 +185,15 @@ class VecBufferPool { //! repeatedly; no-op in read-only mode. int flush_all(); + //! Extend the backing file to `new_size` bytes via ftruncate (no-op if + //! already >= new_size) and refresh the cached file_size_. + //! NOTE: page_table_.entry_num() is NOT updated here -- it stays at the + //! value computed by init(). Callers that need the page_table to cover + //! the extended range must reinitialize the pool (see BufferStorage's + //! append_segment retire-and-reopen flow). Returns true on success, + //! false on a read-only pool or I/O failure. + bool extend_file(size_t new_size); + bool writable() const { return writable_; } From 9290c3e32eacd6917c9e9d68ffe10cfc47abae76 Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Mon, 18 May 2026 22:10:54 +0800 Subject: [PATCH 05/47] upd --- src/core/utility/buffer_storage.cc | 81 +++++++++++++++++++----------- 1 file changed, 52 insertions(+), 29 deletions(-) diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index 4383caeb9..58e0d1b0d 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include #include #include @@ -577,7 +578,7 @@ class BufferStorage : public IndexStorage { meta_chains_.push_back({current_header_start_offset_, footer_offset, segment_start_offset, footer_.segments_meta_size, - segment_ids_offset}); + segment_ids_offset, footer_}); if (footer_.next_meta_header_offset == 0) { break; @@ -691,9 +692,21 @@ class BufferStorage : public IndexStorage { return ret; } - //! Set the index file as dirty + //! Set the index file as dirty. + //! + //! HOT PATH: called once per WrappedSegment::write() / resize() / + //! update_data_crc(). Under 16-thread build (~100k writes total) every + //! unconditional store(true) on this shared cache line triggers MESI + //! invalidation across all cores -- classic cache-line ping-pong even + //! for relaxed atomics. Since the flag is true the vast majority of + //! the time (only flush_index() / refresh_index() reset it), guard the + //! store with a load: when the line is already in Shared/Modified=true + //! state on this core, the load is essentially free and we skip the + //! invalidating store. void set_as_dirty(void) { - index_dirty_ = true; + if (!index_dirty_.load(std::memory_order_relaxed)) { + index_dirty_.store(true, std::memory_order_relaxed); + } } //! Refresh meta information (checksum, update time, etc.) @@ -701,14 +714,16 @@ class BufferStorage : public IndexStorage { // In BufferStorage the segment metadata lives in buffer_pool_buffers_. // CRC recomputation and disk write are deferred to flush_index(). // Just mark dirty so flush_index() will include the metadata write. - index_dirty_ = true; + if (!index_dirty_.load(std::memory_order_relaxed)) { + index_dirty_.store(true, std::memory_order_relaxed); + } } //! Flush index storage: persists any pending meta changes (segments_meta + //! footer) for each header chain, then asks the page cache to write back //! dirty data pages. int flush_index(void) { - if (!index_dirty_) { + if (!index_dirty_.load(std::memory_order_relaxed)) { return 0; } // SHARED LOCK: keep mapping_mutex_ held for the whole flush so that the @@ -724,7 +739,7 @@ class BufferStorage : public IndexStorage { } if (!buffer_pool_->writable()) { // Read-only pool: nothing to flush. - index_dirty_ = false; + index_dirty_.store(false, std::memory_order_relaxed); return 0; } // Flush all dirty data blocks to the backing file first. @@ -733,28 +748,20 @@ class BufferStorage : public IndexStorage { return IndexError_WriteData; } // For each metadata chain, recompute the segment-meta CRC, update the - // footer (segments_meta_crc + footer_crc + update_time), and write both - // the segment metadata and the footer back to the backing file. + // in-memory footer (segments_meta_crc + footer_crc + update_time), and + // write both the segment metadata and the footer back to the backing + // file. Uses the per-chain in-memory footer copy, avoiding a pread. for (size_t ci = 0; ci < meta_chains_.size() && ci < buffer_pool_buffers_.size(); ++ci) { - const MetaChain &chain = meta_chains_[ci]; + MetaChain &mchain = meta_chains_[ci]; const char *seg_buf = buffer_pool_buffers_[ci].get(); - // Read the on-disk footer into a local copy so we can update it. - IndexFormat::MetaFooter footer; - if (buffer_pool_handle_->get_meta( - chain.footer_file_offset, sizeof(footer), - reinterpret_cast(&footer)) != 0) { - LOG_ERROR("Failed to read footer for flush: file[%s], chain[%zu]", - file_name_.c_str(), ci); - return IndexError_Runtime; - } - // Recompute segment metadata CRC and refresh the footer. - footer.segments_meta_crc = - ailego::Crc32c::Hash(seg_buf, chain.segment_meta_size, 0u); - IndexFormat::UpdateMetaFooter(&footer, 0); + // Recompute segment metadata CRC and refresh the per-chain footer. + mchain.footer.segments_meta_crc = + ailego::Crc32c::Hash(seg_buf, mchain.segment_meta_size, 0u); + IndexFormat::UpdateMetaFooter(&mchain.footer, 0); // Write segment metadata back to disk. - if (buffer_pool_handle_->write_meta(chain.segment_meta_file_offset, - chain.segment_meta_size, + if (buffer_pool_handle_->write_meta(mchain.segment_meta_file_offset, + mchain.segment_meta_size, seg_buf) != 0) { LOG_ERROR("Failed to write segment meta: file[%s], chain[%zu]", file_name_.c_str(), ci); @@ -762,14 +769,18 @@ class BufferStorage : public IndexStorage { } // Write the updated footer back to disk. if (buffer_pool_handle_->write_meta( - chain.footer_file_offset, sizeof(footer), - reinterpret_cast(&footer)) != 0) { + mchain.footer_file_offset, sizeof(mchain.footer), + reinterpret_cast(&mchain.footer)) != 0) { LOG_ERROR("Failed to write footer: file[%s], chain[%zu]", file_name_.c_str(), ci); return IndexError_WriteData; } } - index_dirty_ = false; + // Keep the convenience alias in sync with the last chain. + if (!meta_chains_.empty()) { + footer_ = meta_chains_.back().footer; + } + index_dirty_.store(false, std::memory_order_relaxed); return 0; } @@ -945,6 +956,7 @@ class BufferStorage : public IndexStorage { file_name_.c_str()); return IndexError_WriteData; } + chain->footer = footer_; // sync in-memory copy for flush_index // Extend the file and write the new chain's header + (zero) footer. // The segment_meta region is implicitly zero-filled by ftruncate, @@ -992,7 +1004,7 @@ class BufferStorage : public IndexStorage { meta_chains_.push_back(MetaChain{new_chain_start, new_footer_file_offset, new_segment_meta_file_offset, new_segments_meta_size, - new_segments_meta_size}); + new_segments_meta_size, new_footer}); footer_ = new_footer; current_header_start_offset_ = new_chain_start; @@ -1030,6 +1042,7 @@ class BufferStorage : public IndexStorage { footer_.segments_meta_crc = ailego::Crc32c::Hash(meta_buf, chain->segment_meta_size, 0u); IndexFormat::UpdateMetaFooter(&footer_, 0); + chain->footer = footer_; // sync in-memory copy for flush_index if (buffer_pool_handle_->write_meta(chain->segment_meta_file_offset, chain->segment_meta_size, @@ -1059,6 +1072,12 @@ class BufferStorage : public IndexStorage { // retired_pools_ to keep MemoryBlock ref counts safe; we // do NOT re-run ParseToMapping() because the in-memory // state is already authoritative. + // + // flush_all() is REQUIRED here despite the entry-point flush_index() + // having already flushed: between flush_index()'s shared_lock release + // and this function's unique_lock acquisition, other build threads may + // have produced new dirty pages via WrappedSegment::write(). Without + // this flush, the freshly opened pool would pread stale data from disk. if (buffer_pool_handle_) { buffer_pool_handle_->flush_all(); } @@ -1103,7 +1122,7 @@ class BufferStorage : public IndexStorage { } private: - bool index_dirty_{false}; + std::atomic index_dirty_{false}; mutable std::shared_mutex mapping_mutex_{}; std::vector tmp_buffers_{}; @@ -1149,6 +1168,10 @@ class BufferStorage : public IndexStorage { // append_segment() to detect when the chain runs out of meta capacity // and a new chain must be split off. uint32_t segment_ids_offset; + // In-memory copy of this chain's MetaFooter. Kept in sync with disk + // by flush_index() and append_segment(), avoiding a pread per chain + // on every flush. + IndexFormat::MetaFooter footer; }; std::vector meta_chains_{}; }; From 7b0db62b5f079b6432d89f346a548ad24b9fe488 Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Tue, 19 May 2026 14:45:35 +0800 Subject: [PATCH 06/47] upd --- src/core/utility/buffer_storage.cc | 60 ++++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 11 deletions(-) diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index 58e0d1b0d..1db2dd30b 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -85,7 +86,8 @@ class BufferStorage : public IndexStorage { //! LOCKING: takes a shared_lock on owner_->mapping_mutex_ so that //! append_segment() / close_index() cannot tear down the pool mid-call. size_t fetch(size_t offset, void *buf, size_t len) const override { - std::shared_lock latch(owner_->mapping_mutex_); + std::shared_lock latch( + owner_->mapping_shards_[owner_->mapping_shard_id()].mtx); if (ailego_unlikely(!owner_->buffer_pool_handle_)) { LOG_ERROR("WrappedSegment::fetch: handle is null, file[%s], id[%zu]", owner_->file_name_.c_str(), segment_id_); @@ -112,7 +114,8 @@ class BufferStorage : public IndexStorage { //! Read data from segment //! LOCKING: see fetch() above for rationale. size_t read(size_t offset, const void **data, size_t len) override { - std::shared_lock latch(owner_->mapping_mutex_); + std::shared_lock latch( + owner_->mapping_shards_[owner_->mapping_shard_id()].mtx); if (ailego_unlikely(!owner_->buffer_pool_handle_)) { LOG_ERROR("WrappedSegment::read: handle is null, file[%s], id[%zu]", owner_->file_name_.c_str(), segment_id_); @@ -168,7 +171,8 @@ class BufferStorage : public IndexStorage { //! MemoryBlock carries its own ref_count (raised by get_single_page()) //! and will release it via its destructor. size_t read(size_t offset, MemoryBlock &data, size_t len) override { - std::shared_lock latch(owner_->mapping_mutex_); + std::shared_lock latch( + owner_->mapping_shards_[owner_->mapping_shard_id()].mtx); if (ailego_unlikely(!owner_->buffer_pool_handle_)) { LOG_ERROR( "WrappedSegment::read(MemoryBlock&): handle is null, file[%s], " @@ -219,7 +223,8 @@ class BufferStorage : public IndexStorage { //! Write data into the storage with offset //! LOCKING: see fetch() above for rationale. size_t write(size_t offset, const void *data, size_t len) override { - std::shared_lock latch(owner_->mapping_mutex_); + std::shared_lock latch( + owner_->mapping_shards_[owner_->mapping_shard_id()].mtx); if (ailego_unlikely(!owner_->buffer_pool_handle_ || !owner_->buffer_pool_)) { LOG_ERROR("WrappedSegment::write: pool is null, file[%s], id[%zu]", @@ -726,10 +731,11 @@ class BufferStorage : public IndexStorage { if (!index_dirty_.load(std::memory_order_relaxed)) { return 0; } - // SHARED LOCK: keep mapping_mutex_ held for the whole flush so that the + // SHARED LOCK: keep one shard locked for the whole flush so that the // pool/handle cannot be torn down by append_segment()/close_index() // mid-flush. - std::shared_lock latch(mapping_mutex_); + std::shared_lock latch( + mapping_shards_[mapping_shard_id()].mtx); // NULL GUARD: a previous append_segment() may have left the pool in a // torn-down state. if (!buffer_pool_ || !buffer_pool_handle_) { @@ -791,7 +797,7 @@ class BufferStorage : public IndexStorage { // flush_index() internally takes a shared_lock on the same mutex and // std::shared_mutex is NOT reentrant. this->flush_index(); - std::unique_lock latch(mapping_mutex_); + AllShardsExclusiveLatch latch(mapping_shards_); file_name_.clear(); id_hash_.clear(); segments_.clear(); @@ -866,7 +872,7 @@ class BufferStorage : public IndexStorage { // and std::shared_mutex is NOT reentrant. this->flush_index(); - std::unique_lock latch(mapping_mutex_); + AllShardsExclusiveLatch latch(mapping_shards_); if (!buffer_pool_ || !buffer_pool_handle_) { LOG_ERROR("append_segment: pool not ready, file[%s]", @@ -1107,13 +1113,15 @@ class BufferStorage : public IndexStorage { //! Test if a segment exists bool has_segment(const std::string &id) const { - std::shared_lock latch(mapping_mutex_); + std::shared_lock latch( + mapping_shards_[mapping_shard_id()].mtx); return (segments_.find(id) != segments_.end()); } //! Get a segment from storage IndexMapping::SegmentInfo *get_segment_info(const std::string &id) { - std::shared_lock latch(mapping_mutex_); + std::shared_lock latch( + mapping_shards_[mapping_shard_id()].mtx); auto iter = segments_.find(id); if (iter == segments_.end()) { return nullptr; @@ -1123,7 +1131,37 @@ class BufferStorage : public IndexStorage { private: std::atomic index_dirty_{false}; - mutable std::shared_mutex mapping_mutex_{}; + + // Sharded reader-writer lock to eliminate cache-line ping-pong on the + // reader counter. 16 concurrent readers each hash to their own shard, + // avoiding cross-core contention. Writers (append_segment/close_index) + // lock ALL shards to achieve exclusive access. + static constexpr size_t kMappingMutexShards = 32; + struct alignas(64) MutexShard { + std::shared_mutex mtx; + }; + mutable MutexShard mapping_shards_[kMappingMutexShards]{}; + + // Per-thread shard selection (stable hash, no syscall). + size_t mapping_shard_id() const { + thread_local const size_t id = + std::hash()(std::this_thread::get_id()) % + kMappingMutexShards; + return id; + } + + // RAII guard that locks ALL shards exclusively (for writers). + struct AllShardsExclusiveLatch { + MutexShard *shards_; + AllShardsExclusiveLatch(MutexShard *shards) : shards_(shards) { + for (size_t i = 0; i < kMappingMutexShards; ++i) shards_[i].mtx.lock(); + } + ~AllShardsExclusiveLatch() { + for (size_t i = 0; i < kMappingMutexShards; ++i) shards_[i].mtx.unlock(); + } + AllShardsExclusiveLatch(const AllShardsExclusiveLatch &) = delete; + AllShardsExclusiveLatch &operator=(const AllShardsExclusiveLatch &) = delete; + }; std::vector tmp_buffers_{}; mutable std::mutex tmp_buffers_mutex_{}; From 01a46f69f4e08a5ce709260a85518a53eaae1baa Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Tue, 19 May 2026 15:43:07 +0800 Subject: [PATCH 07/47] upd --- src/ailego/buffer/vector_page_table.cc | 110 ++++++++++-------- src/core/utility/buffer_storage.cc | 73 +++--------- .../zvec/ailego/buffer/vector_page_table.h | 58 ++++++--- 3 files changed, 115 insertions(+), 126 deletions(-) diff --git a/src/ailego/buffer/vector_page_table.cc b/src/ailego/buffer/vector_page_table.cc index cb6ec3186..c96e40b91 100644 --- a/src/ailego/buffer/vector_page_table.cc +++ b/src/ailego/buffer/vector_page_table.cc @@ -60,46 +60,66 @@ namespace ailego { const size_t kVectorPageSize = MemoryHelper::PageSize(); void VectorPageTable::init(size_t entry_num) { - if (entries_) { - delete[] entries_; + // Free old segments if any. + for (size_t i = 0; i < segment_count_; ++i) { + delete[] segments_[i]; + segments_[i] = nullptr; } entry_num_ = entry_num; - entries_ = new Entry[entry_num_]; - for (size_t i = 0; i < entry_num_; i++) { - entries_[i].ref_count.store(std::numeric_limits::min()); - entries_[i].in_evict_queue.store(false); - entries_[i].is_dirty.store(false); - entries_[i].buffer = nullptr; - entries_[i].file_offset = 0; + segment_count_ = (entry_num + kSegmentSize - 1) / kSegmentSize; + for (size_t s = 0; s < segment_count_; ++s) { + segments_[s] = new Entry[kSegmentSize]; + for (size_t i = 0; i < kSegmentSize; ++i) { + segments_[s][i].ref_count.store(std::numeric_limits::min()); + segments_[s][i].in_evict_queue.store(false); + segments_[s][i].is_dirty.store(false); + segments_[s][i].buffer = nullptr; + segments_[s][i].file_offset = 0; + } + } +} + +void VectorPageTable::extend(size_t new_entry_num) { + if (new_entry_num <= entry_num_) return; + size_t new_segment_count = (new_entry_num + kSegmentSize - 1) / kSegmentSize; + for (size_t s = segment_count_; s < new_segment_count; ++s) { + segments_[s] = new Entry[kSegmentSize]; + for (size_t i = 0; i < kSegmentSize; ++i) { + segments_[s][i].ref_count.store(std::numeric_limits::min()); + segments_[s][i].in_evict_queue.store(false); + segments_[s][i].is_dirty.store(false); + segments_[s][i].buffer = nullptr; + segments_[s][i].file_offset = 0; + } } + segment_count_ = new_segment_count; + entry_num_ = new_entry_num; } char *VectorPageTable::acquire_block(block_id_t block_id) { assert(block_id < entry_num_); - Entry &entry = entries_[block_id]; + Entry &e = entry_at(block_id); while (true) { - int current_count = entry.ref_count.load(std::memory_order_acquire); + int current_count = e.ref_count.load(std::memory_order_acquire); if (current_count < 0) { return nullptr; } - if (entry.ref_count.compare_exchange_weak(current_count, current_count + 1, - std::memory_order_acq_rel, - std::memory_order_acquire)) { - return entry.buffer; + if (e.ref_count.compare_exchange_weak(current_count, current_count + 1, + std::memory_order_acq_rel, + std::memory_order_acquire)) { + return e.buffer; } } } void VectorPageTable::release_block(block_id_t block_id) { assert(block_id < entry_num_); - Entry &entry = entries_[block_id]; + Entry &e = entry_at(block_id); - if (entry.ref_count.fetch_sub(1, std::memory_order_release) == 1) { + if (e.ref_count.fetch_sub(1, std::memory_order_release) == 1) { std::atomic_thread_fence(std::memory_order_acquire); - // Attempt to transition in_evict_queue from false -> true. The CAS ensures - // only one thread enqueues this block even if multiple threads race here. bool expected = false; - if (entry.in_evict_queue.compare_exchange_strong( + if (e.in_evict_queue.compare_exchange_strong( expected, true, std::memory_order_acq_rel, std::memory_order_relaxed)) { BlockEvictionQueue::BlockType block; @@ -108,58 +128,48 @@ void VectorPageTable::release_block(block_id_t block_id) { block.vector_block.second = 0; BlockEvictionQueue::get_instance().add_single_block(block, 0); } - // else: block is already in the eviction queue; do not add a duplicate - // entry. } } void VectorPageTable::evict_block(block_id_t block_id) { assert(block_id < entry_num_); - Entry &entry = entries_[block_id]; - char *buffer = entry.buffer; + Entry &e = entry_at(block_id); + char *buffer = e.buffer; int expected = 0; - if (entry.ref_count.compare_exchange_strong( + if (e.ref_count.compare_exchange_strong( expected, std::numeric_limits::min())) { - // If the block is dirty, flush it to disk before freeing the memory so - // that no modified data is silently lost during eviction. - if (buffer && entry.is_dirty.load(std::memory_order_relaxed) && + if (buffer && e.is_dirty.load(std::memory_order_relaxed) && flush_callback_) { - flush_callback_(block_id, buffer, kVectorPageSize, entry.file_offset); - entry.is_dirty.store(false, std::memory_order_relaxed); + flush_callback_(block_id, buffer, kVectorPageSize, e.file_offset); + e.is_dirty.store(false, std::memory_order_relaxed); } if (buffer) { MemoryLimitPool::get_instance().release_buffer(buffer, kVectorPageSize); } } - // Always reset in_evict_queue regardless of whether the CAS succeeded: - // - On success: the block is evicted; future releases should re-register it. - // - On failure: the block was re-acquired by another thread between the - // ref-count check and this call. Clearing in_evict_queue lets the next - // release_block() re-enqueue it so it is not silently lost. - entry.in_evict_queue.store(false, std::memory_order_relaxed); + e.in_evict_queue.store(false, std::memory_order_relaxed); } char *VectorPageTable::set_block_acquired(block_id_t block_id, char *buffer, size_t file_offset) { assert(block_id < entry_num_); - Entry &entry = entries_[block_id]; + Entry &e = entry_at(block_id); while (true) { - int current_count = entry.ref_count.load(std::memory_order_relaxed); + int current_count = e.ref_count.load(std::memory_order_relaxed); if (current_count >= 0) { - if (entry.ref_count.compare_exchange_weak( + if (e.ref_count.compare_exchange_weak( current_count, current_count + 1, std::memory_order_acq_rel, std::memory_order_acquire)) { MemoryLimitPool::get_instance().release_buffer(buffer, kVectorPageSize); - return entry.buffer; + return e.buffer; } } else { - entry.buffer = buffer; - entry.file_offset = file_offset; - entry.in_evict_queue.store(false, std::memory_order_relaxed); - // A freshly loaded block is clean (memory matches disk). - entry.is_dirty.store(false, std::memory_order_relaxed); - entry.ref_count.store(1, std::memory_order_release); - return entry.buffer; + e.buffer = buffer; + e.file_offset = file_offset; + e.in_evict_queue.store(false, std::memory_order_relaxed); + e.is_dirty.store(false, std::memory_order_relaxed); + e.ref_count.store(1, std::memory_order_release); + return e.buffer; } } } @@ -404,6 +414,12 @@ bool VecBufferPool::extend_file(size_t new_size) { } #endif file_size_ = new_size; + // Extend the page table to cover the new file range. Existing entries + // stay at their original addresses so concurrent readers are unaffected. + size_t new_entry_num = (file_size_ + kVectorPageSize - 1) / kVectorPageSize; + if (new_entry_num > page_table_.entry_num()) { + page_table_.extend(new_entry_num); + } return true; } diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index 1db2dd30b..b08d146d6 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -83,11 +83,9 @@ class BufferStorage : public IndexStorage { //! Fetch data from segment (with own buffer) //! - //! LOCKING: takes a shared_lock on owner_->mapping_mutex_ so that - //! append_segment() / close_index() cannot tear down the pool mid-call. + //! C1: pool/handle are stable for the lifetime of the index + //! (no retire/rebuild), so no lock is needed on the hot path. size_t fetch(size_t offset, void *buf, size_t len) const override { - std::shared_lock latch( - owner_->mapping_shards_[owner_->mapping_shard_id()].mtx); if (ailego_unlikely(!owner_->buffer_pool_handle_)) { LOG_ERROR("WrappedSegment::fetch: handle is null, file[%s], id[%zu]", owner_->file_name_.c_str(), segment_id_); @@ -112,10 +110,8 @@ class BufferStorage : public IndexStorage { } //! Read data from segment - //! LOCKING: see fetch() above for rationale. + //! C1: lock-free hot path (pool/handle never change during operation). size_t read(size_t offset, const void **data, size_t len) override { - std::shared_lock latch( - owner_->mapping_shards_[owner_->mapping_shard_id()].mtx); if (ailego_unlikely(!owner_->buffer_pool_handle_)) { LOG_ERROR("WrappedSegment::read: handle is null, file[%s], id[%zu]", owner_->file_name_.c_str(), segment_id_); @@ -167,12 +163,8 @@ class BufferStorage : public IndexStorage { return len; } - //! LOCKING: shared_lock held only while wiring the MemoryBlock. The - //! MemoryBlock carries its own ref_count (raised by get_single_page()) - //! and will release it via its destructor. + //! C1: lock-free hot path (pool/handle never change during operation). size_t read(size_t offset, MemoryBlock &data, size_t len) override { - std::shared_lock latch( - owner_->mapping_shards_[owner_->mapping_shard_id()].mtx); if (ailego_unlikely(!owner_->buffer_pool_handle_)) { LOG_ERROR( "WrappedSegment::read(MemoryBlock&): handle is null, file[%s], " @@ -221,10 +213,8 @@ class BufferStorage : public IndexStorage { } //! Write data into the storage with offset - //! LOCKING: see fetch() above for rationale. + //! C1: lock-free hot path (pool/handle never change during operation). size_t write(size_t offset, const void *data, size_t len) override { - std::shared_lock latch( - owner_->mapping_shards_[owner_->mapping_shard_id()].mtx); if (ailego_unlikely(!owner_->buffer_pool_handle_ || !owner_->buffer_pool_)) { LOG_ERROR("WrappedSegment::write: pool is null, file[%s], id[%zu]", @@ -859,17 +849,11 @@ class BufferStorage : public IndexStorage { //! Append a segment into storage. //! - //! Stage 1 implementation: bypass IndexMapping entirely. We compute the - //! new chain layout in memory, persist only the touched bytes via - //! `write_meta` (a few pwrites), and rotate to a fresh VecBufferPool so - //! its page_table_ covers the extended file. ParseToMapping() is NOT - //! re-run because the in-memory state (segments_/chain_headers_/ - //! buffer_pool_buffers_/footer_/meta_chains_) is already authoritative. + //! C1: the page table extends in-place (no pool rotation). The exclusive + //! latch is held only briefly to protect segments_/id_hash_ insertion. int append_segment(const std::string &id, size_t size) { // Flush any in-memory metadata changes (data_size, padding_size, CRC) - // accumulated by prior write()/resize() calls BEFORE we take the - // unique_lock. flush_index() takes a shared_lock on the same mutex - // and std::shared_mutex is NOT reentrant. + // accumulated by prior write()/resize() calls. this->flush_index(); AllShardsExclusiveLatch latch(mapping_shards_); @@ -1073,42 +1057,11 @@ class BufferStorage : public IndexStorage { id_hash_[id] = id_hash_.size(); max_segment_size_ = std::max(max_segment_size_, padded_size); - // ---- Step 3: rotate the buffer pool so its page_table_ covers the - // freshly extended file. The OLD pool is parked in - // retired_pools_ to keep MemoryBlock ref counts safe; we - // do NOT re-run ParseToMapping() because the in-memory - // state is already authoritative. - // - // flush_all() is REQUIRED here despite the entry-point flush_index() - // having already flushed: between flush_index()'s shared_lock release - // and this function's unique_lock acquisition, other build threads may - // have produced new dirty pages via WrappedSegment::write(). Without - // this flush, the freshly opened pool would pread stale data from disk. - if (buffer_pool_handle_) { - buffer_pool_handle_->flush_all(); - } - if (buffer_pool_) { - retired_pools_.push_back(std::move(buffer_pool_)); - retired_handles_.push_back(std::move(buffer_pool_handle_)); - } else { - buffer_pool_handle_.reset(); - } - buffer_pool_.reset(); - - try { - buffer_pool_ = std::make_shared( - file_name_, /*writable=*/true, /*create=*/false); - buffer_pool_handle_ = std::make_shared( - buffer_pool_->get_handle()); - } catch (const std::exception &e) { - LOG_ERROR( - "append_segment: failed to reopen pool: file[%s], what[%s]", - file_name_.c_str(), e.what()); - buffer_pool_.reset(); - buffer_pool_handle_.reset(); - return IndexError_Runtime; - } - return buffer_pool_->init(); + // ---- Step 3: With the segmented page table (C1), extend_file() + // already extended the page table in-place. No pool + // rotation or flush_all is needed — the same pool/handle + // continues to serve both old and new pages. + return 0; } //! Test if a segment exists diff --git a/src/include/zvec/ailego/buffer/vector_page_table.h b/src/include/zvec/ailego/buffer/vector_page_table.h index 588fff87c..5996a9b2c 100644 --- a/src/include/zvec/ailego/buffer/vector_page_table.h +++ b/src/include/zvec/ailego/buffer/vector_page_table.h @@ -60,12 +60,14 @@ class VectorPageTable { using FlushCallback = std::function; - VectorPageTable() : entry_num_(0), entries_(nullptr) { + VectorPageTable() { BlockEvictionQueue::get_instance().set_valid(this); } ~VectorPageTable() { BlockEvictionQueue::get_instance().set_invalid(this); - delete[] entries_; + for (size_t i = 0; i < segment_count_; ++i) { + delete[] segments_[i]; + } } VectorPageTable(const VectorPageTable &) = delete; @@ -75,6 +77,11 @@ class VectorPageTable { void init(size_t entry_num); + //! Extend the page table to cover at least `new_entry_num` entries. + //! Existing entries stay at their original addresses (no invalidation). + //! Safe to call while readers operate on existing pages. + void extend(size_t new_entry_num); + char *acquire_block(block_id_t block_id); void release_block(block_id_t block_id); @@ -91,30 +98,30 @@ class VectorPageTable { //! Mark a loaded block as dirty so that it is persisted on eviction. void mark_dirty(block_id_t block_id) { assert(block_id < entry_num_); - entries_[block_id].is_dirty.store(true, std::memory_order_relaxed); + entry_at(block_id).is_dirty.store(true, std::memory_order_relaxed); } bool is_block_dirty(block_id_t block_id) const { assert(block_id < entry_num_); - return entries_[block_id].is_dirty.load(std::memory_order_relaxed); + return entry_at(block_id).is_dirty.load(std::memory_order_relaxed); } //! Flush a single dirty block without evicting it. Caller guarantees the //! block is currently loaded (buffer != nullptr). int flush_block(block_id_t block_id) { assert(block_id < entry_num_); - Entry &entry = entries_[block_id]; - char *buffer = entry.buffer; + Entry &e = entry_at(block_id); + char *buffer = e.buffer; if (!buffer || !flush_callback_) { return 0; } - if (!entry.is_dirty.load(std::memory_order_relaxed)) { + if (!e.is_dirty.load(std::memory_order_relaxed)) { return 0; } int rc = flush_callback_(block_id, buffer, kVectorPageSize, - entry.file_offset); + e.file_offset); if (rc == 0) { - entry.is_dirty.store(false, std::memory_order_relaxed); + e.is_dirty.store(false, std::memory_order_relaxed); } return rc; } @@ -125,17 +132,33 @@ class VectorPageTable { bool is_released(block_id_t block_id) const { assert(block_id < entry_num_); - return entries_[block_id].ref_count.load(std::memory_order_relaxed) <= 0; + return entry_at(block_id).ref_count.load(std::memory_order_relaxed) <= 0; } inline bool is_dead_block(BlockEvictionQueue::BlockType block) const { - Entry &entry = entries_[block.vector_block.first]; - return !entry.in_evict_queue.load(std::memory_order_relaxed); + const Entry &e = entry_at(block.vector_block.first); + return !e.in_evict_queue.load(std::memory_order_relaxed); } private: + // Segmented page table: entries are split across fixed-size segments so + // that extend() can grow the table without moving existing entries. + static constexpr size_t kSegmentShift = 16; // 65536 entries per segment + static constexpr size_t kSegmentSize = size_t{1} << kSegmentShift; + static constexpr size_t kSegmentMask = kSegmentSize - 1; + static constexpr size_t kMaxSegments = 2048; // up to 128M entries (512GB @ 4K) + size_t entry_num_{0}; - Entry *entries_{nullptr}; + size_t segment_count_{0}; + Entry *segments_[kMaxSegments]{}; + + Entry &entry_at(size_t idx) { + return segments_[idx >> kSegmentShift][idx & kSegmentMask]; + } + const Entry &entry_at(size_t idx) const { + return segments_[idx >> kSegmentShift][idx & kSegmentMask]; + } + FlushCallback flush_callback_{}; }; @@ -186,12 +209,9 @@ class VecBufferPool { int flush_all(); //! Extend the backing file to `new_size` bytes via ftruncate (no-op if - //! already >= new_size) and refresh the cached file_size_. - //! NOTE: page_table_.entry_num() is NOT updated here -- it stays at the - //! value computed by init(). Callers that need the page_table to cover - //! the extended range must reinitialize the pool (see BufferStorage's - //! append_segment retire-and-reopen flow). Returns true on success, - //! false on a read-only pool or I/O failure. + //! already >= new_size), refresh the cached file_size_, and extend the + //! page_table to cover the new range. Returns true on success, false on + //! a read-only pool or I/O failure. bool extend_file(size_t new_size); bool writable() const { From c60900bea6d8fa3d75bf24b06eac67c2a38c2b82 Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Tue, 19 May 2026 17:44:44 +0800 Subject: [PATCH 08/47] fix --- src/core/utility/buffer_storage.cc | 265 ++++++++++++----------------- 1 file changed, 112 insertions(+), 153 deletions(-) diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index b08d146d6..cabaa87f5 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -30,7 +30,29 @@ namespace zvec { namespace core { -/*! MMap File Storage +// Thread-local reusable scratch buffer for cross-page reads in the +// read(const void**) overload. Avoids allocating a new buffer on +// every cross-page read by reusing the same allocation on each thread. The +// returned pointer is valid only until the next cross-page read() on +// the same thread -- matching the single-page path's transient +// lifetime (ref released immediately, page may be evicted any time). +struct CrossPageScratch { + char *buf = nullptr; + size_t cap = 0; + ~CrossPageScratch() { + if (buf) ailego_free(buf); + } + char *ensure(size_t len) { + if (cap < len) { + if (buf) ailego_free(buf); + buf = static_cast(ailego_aligned_malloc(len, 4096)); + cap = buf ? len : 0; + } + return buf; + } +}; + +/*! Buffer Storage */ class BufferStorage : public IndexStorage { public: @@ -148,17 +170,19 @@ class BufferStorage : public IndexStorage { owner_->buffer_pool_handle_->release_one(page_id); return len; } - char *tmp = static_cast(ailego_aligned_malloc(len, 4096)); + // Reuse a thread-local scratch buffer to avoid allocating on + // every cross-page read. The pointer is valid until the next + // cross-page read(const void**) on the same thread. + thread_local CrossPageScratch scratch; + char *tmp = scratch.ensure(len); if (!tmp) { *data = nullptr; return 0; } if (!owner_->buffer_pool_handle_->read_range(abs_offset, len, tmp)) { - ailego_free(tmp); *data = nullptr; return 0; } - owner_->register_tmp_buffer(tmp); *data = tmp; return len; } @@ -193,7 +217,7 @@ class BufferStorage : public IndexStorage { len, page_id); if (!raw) { LOG_ERROR("read error (single-page acquire failed)."); - return -1; + return 0; } data.reset(owner_->buffer_pool_handle_.get(), page_id, raw); return len; @@ -201,12 +225,12 @@ class BufferStorage : public IndexStorage { char *tmp = static_cast(ailego_aligned_malloc(len, 4096)); if (!tmp) { LOG_ERROR("read error (alloc cross-page temp buffer failed)."); - return -1; + return 0; } if (!owner_->buffer_pool_handle_->read_range(abs_offset, len, tmp)) { ailego_free(tmp); LOG_ERROR("read error (cross-page read_range failed)."); - return -1; + return 0; } data = MemoryBlock::MakeOwned(tmp, len); return len; @@ -251,7 +275,7 @@ class BufferStorage : public IndexStorage { // when `data_size` grew, which meant fixed-size segments (e.g. // chunk_meta_segment writing HnswChunkMeta in place) never raised // the dirty flag -- their 4K page-cache pages were not flushed before - // append_segment() / reopen_pool(), so the freshly-rebuilt page table + // append_segment(), so the freshly-rebuilt page table // pread'd stale content from disk and chunk_cnts[NODE] lagged the // real segment count, eventually causing sync_chunks() to see a // mid-state segment and crash with a NULL Chunk::Pointer. @@ -289,7 +313,7 @@ class BufferStorage : public IndexStorage { // Pointer into BufferStorage::segments_ (an unordered_map mapped value). // C++ guarantees the address stays valid across map insertions. All // header / start-offset / segment-meta accesses go through this pointer - // so that re-parses (append_segment -> reopen_pool) are observed without + // so that re-parses after append_segment() are observed without // needing to recreate WrappedSegment instances held by callers. IndexMapping::SegmentInfo *segment_info_{nullptr}; @@ -362,40 +386,12 @@ class BufferStorage : public IndexStorage { return 0; } - void register_tmp_buffer(char *buf) { - std::lock_guard latch(tmp_buffers_mutex_); - tmp_buffers_.push_back(buf); - } - - //! Acquire a page-table block. - //! - //! LOCKING CONTRACT: caller MUST already hold a shared_lock (or - //! unique_lock) on mapping_mutex_. - char *get_buffer(size_t offset, size_t length, size_t /*block_id*/) { - if (ailego_unlikely(!buffer_pool_handle_)) { - LOG_ERROR( - "BufferStorage::get_buffer: handle is null, file[%s], " - "offset[%zu], length[%zu]", - file_name_.c_str(), offset, length); - return nullptr; - } - char *tmp = static_cast(ailego_aligned_malloc(length, 4096)); - if (!tmp) { - return nullptr; - } - if (!buffer_pool_handle_->read_range(offset, length, tmp)) { - ailego_free(tmp); - return nullptr; - } - register_tmp_buffer(tmp); - return tmp; - } - int ParseHeader(size_t offset, IndexFormat::MetaHeader *out) { std::unique_ptr buffer(new char[sizeof(*out)]); - // NOTE: bypass a wrapper get_meta() -- ParseHeader is called from - // reopen_pool() which already holds a unique_lock on mapping_mutex_ - // (std::shared_mutex is not reentrant -> deadlock). + // ParseHeader is called from ParseToMapping which is itself called + // from either open() (single-threaded) or append_segment() (under + // AllShardsExclusiveLatch). Do NOT add an internal lock here -- + // std::shared_mutex is not reentrant -> deadlock. if (buffer_pool_handle_->get_meta(offset, sizeof(*out), buffer.get()) != 0) { LOG_ERROR("Get segment header failed."); @@ -440,9 +436,8 @@ class BufferStorage : public IndexStorage { uint32_t *out_segment_ids_offset) { // NOTE: this function is only called from ParseToMapping(), which is // itself called from either open() (single-threaded construction) or - // reopen_pool() (always invoked under the unique_lock held by - // append_segment()). Do NOT add an internal lock here -- doing so would - // deadlock the append_segment() path. + // append_segment() (under AllShardsExclusiveLatch). Do NOT add an + // internal lock here -- doing so would deadlock the append path. std::unique_ptr segment_buffer = std::make_unique(footer_.segments_meta_size); // Bypass wrapper -- see ParseHeader() comment for why. @@ -482,7 +477,8 @@ class BufferStorage : public IndexStorage { // reflect stale entries and produce wrong IDs on re-parse. const std::string seg_name(reinterpret_cast(segment_start) + iter->segment_id_offset); - id_hash_[seg_name] = id_hash_.size(); + const size_t seg_id = id_hash_.size(); + id_hash_[seg_name] = seg_id; // Update the segments_ entry in-place so that any WrappedSegment // instances that already hold a pointer to this entry (via // &segments_[name].segment) continue to use the refreshed meta_ptr_ @@ -611,11 +607,18 @@ class BufferStorage : public IndexStorage { //! Retrieve a segment by id IndexStorage::Segment::Pointer get(const std::string &id, int) override { - auto segment_info = this->get_segment_info(id); - if (!segment_info) { + std::shared_lock latch( + mapping_shards_[mapping_shard_id()].mtx); + auto seg_iter = segments_.find(id); + if (seg_iter == segments_.end()) { return WrappedSegment::Pointer{}; } - return std::make_shared(this, segment_info, id_hash_[id]); + auto id_iter = id_hash_.find(id); + if (id_iter == id_hash_.end()) { + return WrappedSegment::Pointer{}; + } + return std::make_shared(this, &seg_iter->second, + id_iter->second); } //! Test if it a segment exists @@ -793,60 +796,14 @@ class BufferStorage : public IndexStorage { segments_.clear(); chain_headers_.clear(); memset(&footer_, 0, sizeof(footer_)); - { - std::lock_guard tmp_latch(tmp_buffers_mutex_); - for (char *p : tmp_buffers_) { - if (p) { - ailego_free(p); - } - } - tmp_buffers_.clear(); - } buffer_pool_handle_.reset(); buffer_pool_.reset(); max_segment_size_ = 0; buffer_pool_buffers_.clear(); meta_chains_.clear(); - // Drop retired pools last -- any stray MemoryBlock still holding a raw - // handle pointer would hit use-after-free here, but by close_index() - // time all build/search threads are expected to have joined. - retired_handles_.clear(); - retired_pools_.clear(); current_header_start_offset_ = 0; } - //! Reopen the buffer pool and reload the mapping. Used both as the final - //! success step of append_segment() and as a rollback path when any - //! IndexMapping operation fails mid-way through append_segment(). - //! - //! VecBufferPool's constructor throws on open()/fstat() failure; we catch - //! that here and translate it into an error code. - int reopen_pool() { - try { - buffer_pool_ = std::make_shared( - file_name_, /*writable=*/true, /*create=*/false); - buffer_pool_handle_ = std::make_shared( - buffer_pool_->get_handle()); - } catch (const std::exception &e) { - LOG_ERROR( - "BufferStorage::reopen_pool failed to create pool: file[%s], " - "what[%s]", - file_name_.c_str(), e.what()); - buffer_pool_.reset(); - buffer_pool_handle_.reset(); - return IndexError_Runtime; - } - int ret = ParseToMapping(); - if (ret != 0) { - LOG_ERROR( - "BufferStorage::reopen_pool failed to parse mapping: file[%s], " - "errno[%d]", - file_name_.c_str(), ret); - return ret; - } - return buffer_pool_->init(); - } - //! Append a segment into storage. //! //! C1: the page table extends in-place (no pool rotation). The exclusive @@ -881,33 +838,6 @@ class BufferStorage : public IndexStorage { return IndexError_Runtime; } - // Retire stale pools whose blocks are no longer referenced. Reused - // from the prior implementation so MemoryBlock instances held by other - // threads keep their raw VecBufferPoolHandle* alive. - auto prune_retired = [&]() { - size_t w = 0; - for (size_t r = 0; r < retired_pools_.size(); ++r) { - bool any_held = false; - auto &pt = retired_pools_[r]->page_table_; - for (size_t i = 0; i < pt.entry_num(); ++i) { - if (!pt.is_released(i)) { - any_held = true; - break; - } - } - if (any_held) { - if (w != r) { - retired_pools_[w] = std::move(retired_pools_[r]); - retired_handles_[w] = std::move(retired_handles_[r]); - } - ++w; - } - } - retired_pools_.resize(w); - retired_handles_.resize(w); - }; - prune_retired(); - // Page-aligned padded size for the new segment. Matches IndexMapping's // CalcPageAlignedSize() so the on-disk layout stays identical. const size_t page_size = ailego::kVectorPageSize; @@ -936,22 +866,35 @@ class BufferStorage : public IndexStorage { new_meta_total - sizeof(IndexFormat::MetaHeader) - sizeof(IndexFormat::MetaFooter)); - // Update OLD footer in memory + on disk so it links to the new chain. - footer_.next_meta_header_offset = new_chain_start; - IndexFormat::UpdateMetaFooter(&footer_, 0); + // Prepare the linked old footer WITHOUT mutating footer_ yet so + // that a write failure leaves in-memory state untouched. + const auto saved_footer = footer_; + IndexFormat::MetaFooter linked_footer = footer_; + linked_footer.next_meta_header_offset = new_chain_start; + IndexFormat::UpdateMetaFooter(&linked_footer, 0); + + // Write old footer with forward link to disk. if (buffer_pool_handle_->write_meta( - chain->footer_file_offset, sizeof(footer_), - reinterpret_cast(&footer_)) != 0) { + chain->footer_file_offset, sizeof(linked_footer), + reinterpret_cast(&linked_footer)) != 0) { LOG_ERROR("append_segment: write old footer failed, file[%s]", file_name_.c_str()); return IndexError_WriteData; } - chain->footer = footer_; // sync in-memory copy for flush_index + + // Best-effort rollback: restore original old footer on disk if a + // subsequent disk write in this split block fails. + auto undo_old_footer = [&]() { + buffer_pool_handle_->write_meta( + chain->footer_file_offset, sizeof(saved_footer), + reinterpret_cast(&saved_footer)); + }; // Extend the file and write the new chain's header + (zero) footer. // The segment_meta region is implicitly zero-filled by ftruncate, // matching the empty `new_meta_buf` we keep in memory. if (!buffer_pool_->extend_file(new_chain_start + new_meta_total)) { + undo_old_footer(); return IndexError_Runtime; } @@ -976,6 +919,7 @@ class BufferStorage : public IndexStorage { if (buffer_pool_handle_->write_meta( new_chain_start, sizeof(IndexFormat::MetaHeader), reinterpret_cast(new_header.get())) != 0) { + undo_old_footer(); return IndexError_WriteData; } uint64_t new_segment_meta_file_offset = @@ -985,10 +929,12 @@ class BufferStorage : public IndexStorage { if (buffer_pool_handle_->write_meta( new_footer_file_offset, sizeof(new_footer), reinterpret_cast(&new_footer)) != 0) { + undo_old_footer(); return IndexError_WriteData; } - // Mirror to in-memory state. + // All split disk writes succeeded -- commit in-memory state. + chain->footer = linked_footer; // old chain keeps linked footer chain_headers_.push_back(std::move(new_header)); buffer_pool_buffers_.push_back(std::move(new_meta_buf)); meta_chains_.push_back(MetaChain{new_chain_start, new_footer_file_offset, @@ -1015,6 +961,23 @@ class BufferStorage : public IndexStorage { } } + // Save mutable state for rollback if a disk write fails below. + const auto saved_footer = footer_; + const auto saved_chain_footer = chain->footer; + const auto saved_segment_ids_offset = chain->segment_ids_offset; + // Save the meta_buf regions that will be overwritten (SegmentMeta + // entry and segment-ID string) so they can be restored exactly, + // keeping the CRC consistent for a potential later flush_index(). + const size_t meta_entry_off = + sizeof(IndexFormat::SegmentMeta) * footer_.segment_count; + const uint32_t new_ids_off = + chain->segment_ids_offset - static_cast(id_size); + char saved_meta_entry[sizeof(IndexFormat::SegmentMeta)]; + std::memcpy(saved_meta_entry, meta_buf + meta_entry_off, + sizeof(IndexFormat::SegmentMeta)); + std::unique_ptr saved_id_bytes(new char[id_size]); + std::memcpy(saved_id_bytes.get(), meta_buf + new_ids_off, id_size); + chain->segment_ids_offset -= static_cast(id_size); IndexFormat::SegmentMeta *new_seg = reinterpret_cast(meta_buf) + @@ -1034,11 +997,24 @@ class BufferStorage : public IndexStorage { IndexFormat::UpdateMetaFooter(&footer_, 0); chain->footer = footer_; // sync in-memory copy for flush_index + // Rollback helper: restore meta_buf, footer_, and chain fields to + // their pre-Step-2 values so that flush_index() writes consistent + // metadata and the next append_segment() can retry cleanly. + auto rollback_step2 = [&]() { + std::memcpy(meta_buf + meta_entry_off, saved_meta_entry, + sizeof(IndexFormat::SegmentMeta)); + std::memcpy(meta_buf + new_ids_off, saved_id_bytes.get(), id_size); + footer_ = saved_footer; + chain->footer = saved_chain_footer; + chain->segment_ids_offset = saved_segment_ids_offset; + }; + if (buffer_pool_handle_->write_meta(chain->segment_meta_file_offset, chain->segment_meta_size, meta_buf) != 0) { LOG_ERROR("append_segment: write segment_meta failed, file[%s]", file_name_.c_str()); + rollback_step2(); return IndexError_WriteData; } if (buffer_pool_handle_->write_meta( @@ -1046,15 +1022,18 @@ class BufferStorage : public IndexStorage { reinterpret_cast(&footer_)) != 0) { LOG_ERROR("append_segment: write footer failed, file[%s]", file_name_.c_str()); + rollback_step2(); return IndexError_WriteData; } - // Mirror to in-memory mapping. WrappedSegment instances already held - // by callers reference &segments_[name], whose address is stable across - // unordered_map insertions, so existing references stay valid. + // All disk writes succeeded -- commit remaining in-memory state. + // WrappedSegment instances already held by callers reference + // &segments_[name], whose address is stable across unordered_map + // insertions, so existing references stay valid. segments_[id] = IndexMapping::SegmentInfo{ IndexMapping::Segment{new_seg}, chain->header_start_offset, header}; - id_hash_[id] = id_hash_.size(); + const size_t new_id = id_hash_.size(); + id_hash_[id] = new_id; max_segment_size_ = std::max(max_segment_size_, padded_size); // ---- Step 3: With the segmented page table (C1), extend_file() @@ -1071,22 +1050,11 @@ class BufferStorage : public IndexStorage { return (segments_.find(id) != segments_.end()); } - //! Get a segment from storage - IndexMapping::SegmentInfo *get_segment_info(const std::string &id) { - std::shared_lock latch( - mapping_shards_[mapping_shard_id()].mtx); - auto iter = segments_.find(id); - if (iter == segments_.end()) { - return nullptr; - } - return &iter->second; - } - private: std::atomic index_dirty_{false}; // Sharded reader-writer lock to eliminate cache-line ping-pong on the - // reader counter. 16 concurrent readers each hash to their own shard, + // reader counter. Each concurrent reader hashes to its own shard, // avoiding cross-core contention. Writers (append_segment/close_index) // lock ALL shards to achieve exclusive access. static constexpr size_t kMappingMutexShards = 32; @@ -1116,9 +1084,6 @@ class BufferStorage : public IndexStorage { AllShardsExclusiveLatch &operator=(const AllShardsExclusiveLatch &) = delete; }; - std::vector tmp_buffers_{}; - mutable std::mutex tmp_buffers_mutex_{}; - // buffer manager std::string file_name_; // Per-chain owning copies of MetaHeader. segments_[name].segment_header @@ -1132,15 +1097,9 @@ class BufferStorage : public IndexStorage { uint64_t max_segment_size_{0}; std::vector> buffer_pool_buffers_{}; - // Retired pools: see prune_retired() in append_segment() for the - // life-cycle contract. - std::vector retired_pools_{}; - std::vector retired_handles_{}; - ailego::VecBufferPool::Pointer buffer_pool_{nullptr}; ailego::VecBufferPoolHandle::Pointer buffer_pool_handle_{nullptr}; uint64_t current_header_start_offset_{0u}; - uint64_t buffer_size_{2lu * 1024 * 1024 * 1024}; // 2G // Capacity (in bytes) of the segment metadata section written by // init_index(). From f0b989876823a4c59eeb766d7f18e0aa7a0ecc96 Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Tue, 19 May 2026 17:57:10 +0800 Subject: [PATCH 09/47] fix --- src/core/utility/buffer_storage.cc | 51 +++++++++++++++++------------- 1 file changed, 29 insertions(+), 22 deletions(-) diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index cabaa87f5..5bcffcfc9 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -113,13 +113,12 @@ class BufferStorage : public IndexStorage { owner_->file_name_.c_str(), segment_id_); return 0; } - if (ailego_unlikely(offset + len > - segment_info_->segment.meta()->data_size)) { - auto meta = segment_info_->segment.meta(); - if (offset > meta->data_size) { - offset = meta->data_size; + const size_t data_size = segment_info_->segment.meta()->data_size; + if (ailego_unlikely(offset > data_size || len > data_size - offset)) { + if (offset > data_size) { + offset = data_size; } - len = meta->data_size - offset; + len = data_size - offset; } size_t abs_offset = segment_info_->segment_header_start_offset + segment_info_->segment_header->content_offset + @@ -140,13 +139,12 @@ class BufferStorage : public IndexStorage { *data = nullptr; return 0; } - if (ailego_unlikely(offset + len > - segment_info_->segment.meta()->data_size)) { - auto meta = segment_info_->segment.meta(); - if (offset > meta->data_size) { - offset = meta->data_size; + const size_t data_size = segment_info_->segment.meta()->data_size; + if (ailego_unlikely(offset > data_size || len > data_size - offset)) { + if (offset > data_size) { + offset = data_size; } - len = meta->data_size - offset; + len = data_size - offset; } size_t abs_offset = segment_info_->segment_header_start_offset + segment_info_->segment_header->content_offset + @@ -196,13 +194,12 @@ class BufferStorage : public IndexStorage { owner_->file_name_.c_str(), segment_id_); return 0; } - if (ailego_unlikely(offset + len > - segment_info_->segment.meta()->data_size)) { - auto meta = segment_info_->segment.meta(); - if (offset > meta->data_size) { - offset = meta->data_size; + const size_t data_size = segment_info_->segment.meta()->data_size; + if (ailego_unlikely(offset > data_size || len > data_size - offset)) { + if (offset > data_size) { + offset = data_size; } - len = meta->data_size - offset; + len = data_size - offset; } size_t abs_offset = segment_info_->segment_header_start_offset + segment_info_->segment_header->content_offset + @@ -250,7 +247,7 @@ class BufferStorage : public IndexStorage { if (!owner_->buffer_pool_->writable()) { return len; } - if (ailego_unlikely(offset + len > capacity_)) { + if (ailego_unlikely(offset > capacity_ || len > capacity_ - offset)) { LOG_ERROR("write() exceeds segment capacity: offset=%zu len=%zu cap=%zu", offset, len, capacity_); return 0; @@ -372,10 +369,12 @@ class BufferStorage : public IndexStorage { buffer_pool_->get_handle()); int ret = ParseToMapping(); if (ret != 0) { + this->close_index(); return ret; } ret = buffer_pool_->init(); if (ret != 0) { + this->close_index(); return ret; } LOG_INFO( @@ -457,7 +456,7 @@ class BufferStorage : public IndexStorage { for (IndexFormat::SegmentMeta *iter = segment_start, *end = segment_start + footer_.segment_count; iter != end; ++iter) { - if (iter->segment_id_offset > footer_.segments_meta_size) { + if (iter->segment_id_offset >= footer_.segments_meta_size) { return IndexError_InvalidValue; } if (iter->data_index > footer_.content_size) { @@ -708,7 +707,11 @@ class BufferStorage : public IndexStorage { } //! Refresh meta information (checksum, update time, etc.) - void refresh_index(uint64_t /*chkp*/) { + void refresh_index(uint64_t chkp) { + // Store the checkpoint so flush_index() can persist it. + if (chkp != 0) { + pending_check_point_ = chkp; + } // In BufferStorage the segment metadata lives in buffer_pool_buffers_. // CRC recomputation and disk write are deferred to flush_index(). // Just mark dirty so flush_index() will include the metadata write. @@ -757,7 +760,7 @@ class BufferStorage : public IndexStorage { // Recompute segment metadata CRC and refresh the per-chain footer. mchain.footer.segments_meta_crc = ailego::Crc32c::Hash(seg_buf, mchain.segment_meta_size, 0u); - IndexFormat::UpdateMetaFooter(&mchain.footer, 0); + IndexFormat::UpdateMetaFooter(&mchain.footer, pending_check_point_); // Write segment metadata back to disk. if (buffer_pool_handle_->write_meta(mchain.segment_meta_file_offset, mchain.segment_meta_size, @@ -779,6 +782,7 @@ class BufferStorage : public IndexStorage { if (!meta_chains_.empty()) { footer_ = meta_chains_.back().footer; } + pending_check_point_ = 0; index_dirty_.store(false, std::memory_order_relaxed); return 0; } @@ -802,6 +806,8 @@ class BufferStorage : public IndexStorage { buffer_pool_buffers_.clear(); meta_chains_.clear(); current_header_start_offset_ = 0; + pending_check_point_ = 0; + index_dirty_.store(false, std::memory_order_relaxed); } //! Append a segment into storage. @@ -1052,6 +1058,7 @@ class BufferStorage : public IndexStorage { private: std::atomic index_dirty_{false}; + uint64_t pending_check_point_{0}; // Sharded reader-writer lock to eliminate cache-line ping-pong on the // reader counter. Each concurrent reader hashes to its own shard, From 4997a1fefca12a181dea13c764699359b932f9af Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Tue, 19 May 2026 19:37:56 +0800 Subject: [PATCH 10/47] fix --- src/core/utility/buffer_storage.cc | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index 5bcffcfc9..3c5917b37 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -708,9 +708,11 @@ class BufferStorage : public IndexStorage { //! Refresh meta information (checksum, update time, etc.) void refresh_index(uint64_t chkp) { - // Store the checkpoint so flush_index() can persist it. + // Store the checkpoint so flush_index() can persist it. Use relaxed + // atomics to avoid a data race with flush_index() readers/resetters + // (they may run concurrently on different threads). if (chkp != 0) { - pending_check_point_ = chkp; + pending_check_point_.store(chkp, std::memory_order_relaxed); } // In BufferStorage the segment metadata lives in buffer_pool_buffers_. // CRC recomputation and disk write are deferred to flush_index(). @@ -760,7 +762,9 @@ class BufferStorage : public IndexStorage { // Recompute segment metadata CRC and refresh the per-chain footer. mchain.footer.segments_meta_crc = ailego::Crc32c::Hash(seg_buf, mchain.segment_meta_size, 0u); - IndexFormat::UpdateMetaFooter(&mchain.footer, pending_check_point_); + IndexFormat::UpdateMetaFooter( + &mchain.footer, + pending_check_point_.load(std::memory_order_relaxed)); // Write segment metadata back to disk. if (buffer_pool_handle_->write_meta(mchain.segment_meta_file_offset, mchain.segment_meta_size, @@ -782,7 +786,7 @@ class BufferStorage : public IndexStorage { if (!meta_chains_.empty()) { footer_ = meta_chains_.back().footer; } - pending_check_point_ = 0; + pending_check_point_.store(0, std::memory_order_relaxed); index_dirty_.store(false, std::memory_order_relaxed); return 0; } @@ -806,7 +810,7 @@ class BufferStorage : public IndexStorage { buffer_pool_buffers_.clear(); meta_chains_.clear(); current_header_start_offset_ = 0; - pending_check_point_ = 0; + pending_check_point_.store(0, std::memory_order_relaxed); index_dirty_.store(false, std::memory_order_relaxed); } @@ -1058,7 +1062,7 @@ class BufferStorage : public IndexStorage { private: std::atomic index_dirty_{false}; - uint64_t pending_check_point_{0}; + std::atomic pending_check_point_{0}; // Sharded reader-writer lock to eliminate cache-line ping-pong on the // reader counter. Each concurrent reader hashes to its own shard, From 4bece9aca0e7056baa2435d0483561410c9d0a89 Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Tue, 19 May 2026 19:46:57 +0800 Subject: [PATCH 11/47] fix --- src/core/utility/buffer_storage.cc | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index 3c5917b37..cc9df7280 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -746,6 +746,13 @@ class BufferStorage : public IndexStorage { index_dirty_.store(false, std::memory_order_relaxed); return 0; } + // Snapshot the pending checkpoint at the start of the flush. We will + // use CAS at the end to reset it to 0 only if no concurrent + // refresh_index() has stored a newer value during the flush; otherwise + // the newer value (and dirty=true) must be preserved so the next + // flush_index() picks it up. + const uint64_t consumed_chkp = + pending_check_point_.load(std::memory_order_relaxed); // Flush all dirty data blocks to the backing file first. if (buffer_pool_handle_->flush_all() != 0) { LOG_ERROR("flush_all data blocks failed: file[%s]", file_name_.c_str()); @@ -762,9 +769,7 @@ class BufferStorage : public IndexStorage { // Recompute segment metadata CRC and refresh the per-chain footer. mchain.footer.segments_meta_crc = ailego::Crc32c::Hash(seg_buf, mchain.segment_meta_size, 0u); - IndexFormat::UpdateMetaFooter( - &mchain.footer, - pending_check_point_.load(std::memory_order_relaxed)); + IndexFormat::UpdateMetaFooter(&mchain.footer, consumed_chkp); // Write segment metadata back to disk. if (buffer_pool_handle_->write_meta(mchain.segment_meta_file_offset, mchain.segment_meta_size, @@ -786,8 +791,16 @@ class BufferStorage : public IndexStorage { if (!meta_chains_.empty()) { footer_ = meta_chains_.back().footer; } - pending_check_point_.store(0, std::memory_order_relaxed); - index_dirty_.store(false, std::memory_order_relaxed); + // CAS-reset: only consume the checkpoint we observed at the start. + // If a concurrent refresh_index() stored a newer value mid-flush, CAS + // fails and the newer value remains in pending_check_point_ along with + // dirty=true, so the next flush_index() will persist it. + uint64_t expected = consumed_chkp; + const bool consumed = pending_check_point_.compare_exchange_strong( + expected, 0, std::memory_order_relaxed); + if (consumed) { + index_dirty_.store(false, std::memory_order_relaxed); + } return 0; } From 91e7b7f3b5f2455f57f9f42af3e8d4821a5ee9f7 Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Tue, 19 May 2026 19:56:10 +0800 Subject: [PATCH 12/47] fix --- src/core/utility/buffer_storage.cc | 49 +++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index cc9df7280..5095cb841 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -746,15 +746,33 @@ class BufferStorage : public IndexStorage { index_dirty_.store(false, std::memory_order_relaxed); return 0; } - // Snapshot the pending checkpoint at the start of the flush. We will - // use CAS at the end to reset it to 0 only if no concurrent - // refresh_index() has stored a newer value during the flush; otherwise - // the newer value (and dirty=true) must be preserved so the next - // flush_index() picks it up. + // Atomically claim the dirty flag at the START of the flush, not at the + // end. This prevents a TOCTOU race against the lock-free hot path: + // any WrappedSegment::write() that happens between flush_all() and the + // end of this function will simply re-set dirty=true (its set_as_dirty + // observes our cleared flag), and the next flush_index() will pick up + // those new dirty pages. An unconditional store(false) at the end + // would silently swallow that concurrent write. + bool expected_dirty = true; + if (!index_dirty_.compare_exchange_strong(expected_dirty, false, + std::memory_order_relaxed)) { + // Another thread already claimed and is performing the flush; treat + // this call as a no-op. The previous design (no CAS) allowed + // duplicate concurrent flushers; bailing out here is strictly safer + // because both flushers would otherwise race on per-chain footer + // mutation in the loop below. + return 0; + } + // Snapshot the pending checkpoint AFTER claiming dirty so that we + // observe at least every refresh_index() that happened before we + // claimed. The CAS-reset at the end will preserve any newer chkp + // stored by a concurrent refresh_index() during this flush. const uint64_t consumed_chkp = pending_check_point_.load(std::memory_order_relaxed); // Flush all dirty data blocks to the backing file first. if (buffer_pool_handle_->flush_all() != 0) { + // Restore dirty so the next flush_index() retries. + index_dirty_.store(true, std::memory_order_relaxed); LOG_ERROR("flush_all data blocks failed: file[%s]", file_name_.c_str()); return IndexError_WriteData; } @@ -776,6 +794,7 @@ class BufferStorage : public IndexStorage { seg_buf) != 0) { LOG_ERROR("Failed to write segment meta: file[%s], chain[%zu]", file_name_.c_str(), ci); + index_dirty_.store(true, std::memory_order_relaxed); return IndexError_WriteData; } // Write the updated footer back to disk. @@ -784,6 +803,7 @@ class BufferStorage : public IndexStorage { reinterpret_cast(&mchain.footer)) != 0) { LOG_ERROR("Failed to write footer: file[%s], chain[%zu]", file_name_.c_str(), ci); + index_dirty_.store(true, std::memory_order_relaxed); return IndexError_WriteData; } } @@ -791,16 +811,15 @@ class BufferStorage : public IndexStorage { if (!meta_chains_.empty()) { footer_ = meta_chains_.back().footer; } - // CAS-reset: only consume the checkpoint we observed at the start. - // If a concurrent refresh_index() stored a newer value mid-flush, CAS - // fails and the newer value remains in pending_check_point_ along with - // dirty=true, so the next flush_index() will persist it. - uint64_t expected = consumed_chkp; - const bool consumed = pending_check_point_.compare_exchange_strong( - expected, 0, std::memory_order_relaxed); - if (consumed) { - index_dirty_.store(false, std::memory_order_relaxed); - } + // CAS-reset pending: only consume the checkpoint we observed at the + // start. If a concurrent refresh_index() stored a newer value during + // the flush, CAS fails and the newer value remains in + // pending_check_point_; refresh_index() also re-set dirty=true (since + // we cleared it at the top), so the next flush_index() will persist + // the newer chkp. + uint64_t expected_chkp = consumed_chkp; + pending_check_point_.compare_exchange_strong(expected_chkp, 0, + std::memory_order_relaxed); return 0; } From f78fe39babfcf3576dfe4f35911ca33f4b058d8a Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Tue, 19 May 2026 20:37:59 +0800 Subject: [PATCH 13/47] fix --- src/core/utility/buffer_storage.cc | 162 +++++++++++++++++++++++------ 1 file changed, 131 insertions(+), 31 deletions(-) diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index 5095cb841..528252bca 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -234,8 +235,18 @@ class BufferStorage : public IndexStorage { } //! Write data into the storage with offset - //! C1: lock-free hot path (pool/handle never change during operation). + //! + //! Takes a SHARED latch on the owner's mapping shard. This pairs with + //! the EXCLUSIVE all-shards latch held by flush_index() / append_segment() + //! around the meta_buf CRC + write_meta phase: writers parallelize + //! across (and within) shards, but are fully excluded while CRC is + //! computed over the meta_buf bytes that this method mutates + //! (data_size / padding_size). Without this latch the lock-free hot + //! path raced with the CRC compute, producing footer.segments_meta_crc + //! that did not match the bytes pwrite()'d to disk. size_t write(size_t offset, const void *data, size_t len) override { + std::shared_lock latch( + owner_->mapping_shards_[owner_->mapping_shard_id()].mtx); if (ailego_unlikely(!owner_->buffer_pool_handle_ || !owner_->buffer_pool_)) { LOG_ERROR("WrappedSegment::write: pool is null, file[%s], id[%zu]", @@ -281,7 +292,13 @@ class BufferStorage : public IndexStorage { } //! Resize size of data + //! + //! Takes a SHARED latch for the same reason as write(): mutating + //! meta->data_size / padding_size must be excluded from the CRC + //! compute in flush_index() / append_segment(). size_t resize(size_t size) override { + std::shared_lock latch( + owner_->mapping_shards_[owner_->mapping_shard_id()].mtx); auto meta = segment_info_->segment.meta(); if (meta->data_size != size) { if (size > capacity_) { @@ -295,7 +312,13 @@ class BufferStorage : public IndexStorage { } //! Update crc of data + //! + //! Takes a SHARED latch for the same reason as write(): mutating + //! meta->data_crc must be excluded from the CRC compute in + //! flush_index() / append_segment(). void update_data_crc(uint32_t crc) override { + std::shared_lock latch( + owner_->mapping_shards_[owner_->mapping_shard_id()].mtx); segment_info_->segment.meta()->data_crc = crc; owner_->set_as_dirty(); } @@ -692,34 +715,50 @@ class BufferStorage : public IndexStorage { //! Set the index file as dirty. //! //! HOT PATH: called once per WrappedSegment::write() / resize() / - //! update_data_crc(). Under 16-thread build (~100k writes total) every - //! unconditional store(true) on this shared cache line triggers MESI - //! invalidation across all cores -- classic cache-line ping-pong even - //! for relaxed atomics. Since the flag is true the vast majority of - //! the time (only flush_index() / refresh_index() reset it), guard the - //! store with a load: when the line is already in Shared/Modified=true - //! state on this core, the load is essentially free and we skip the - //! invalidating store. + //! update_data_crc(). We MUST unconditionally store(true) here, not + //! guard with a load-then-store: under relaxed semantics a writer can + //! observe a stale dirty=true (its own core's cached value) AFTER + //! flush_index() has CAS'd dirty to false on another core, then skip + //! its own store and the writer's modification gets dropped (next + //! flush_index() short-circuits at the top because dirty is false). + //! The MESI ping-pong is the cost of correctness; it is bounded by the + //! caller's write rate and amortized by the caller's actual I/O. void set_as_dirty(void) { - if (!index_dirty_.load(std::memory_order_relaxed)) { - index_dirty_.store(true, std::memory_order_relaxed); - } + index_dirty_.store(true, std::memory_order_relaxed); } //! Refresh meta information (checksum, update time, etc.) void refresh_index(uint64_t chkp) { - // Store the checkpoint so flush_index() can persist it. Use relaxed - // atomics to avoid a data race with flush_index() readers/resetters - // (they may run concurrently on different threads). + // Monotonic merge: callers may invoke refresh() out of order under + // concurrency (parallel writers, retries, batched commits delivered on + // different threads). An unconditional store would let a smaller chkp + // arriving later overwrite a larger one, violating the upper-layer + // invariant that the persisted check_point is non-decreasing. CAS-loop + // max guarantees the largest observed value wins regardless of arrival + // order; relaxed ordering is sufficient because flush_index() takes the + // all-shards exclusive latch which establishes the necessary + // happens-before for the actual disk write. if (chkp != 0) { - pending_check_point_.store(chkp, std::memory_order_relaxed); + uint64_t cur = pending_check_point_.load(std::memory_order_relaxed); + while (chkp > cur) { + if (pending_check_point_.compare_exchange_weak( + cur, chkp, std::memory_order_relaxed)) { + break; + } + // compare_exchange_weak refreshed `cur`; loop checks chkp > cur + // again and exits if some other thread already raised pending past + // our value. + } } // In BufferStorage the segment metadata lives in buffer_pool_buffers_. // CRC recomputation and disk write are deferred to flush_index(). - // Just mark dirty so flush_index() will include the metadata write. - if (!index_dirty_.load(std::memory_order_relaxed)) { - index_dirty_.store(true, std::memory_order_relaxed); - } + // Mark dirty unconditionally for the same reason as set_as_dirty(): + // a load-then-store guard would let a stale `true` observation skip + // the store and lose this refresh. Note: even when our chkp lost the + // CAS race (was discarded as stale), we still set dirty -- the winning + // larger chkp must be flushed, and flush_index()'s UpdateMetaFooter() + // is a no-op for chkp==0 so a spurious extra flush is harmless. + index_dirty_.store(true, std::memory_order_relaxed); } //! Flush index storage: persists any pending meta changes (segments_meta + @@ -729,11 +768,15 @@ class BufferStorage : public IndexStorage { if (!index_dirty_.load(std::memory_order_relaxed)) { return 0; } - // SHARED LOCK: keep one shard locked for the whole flush so that the - // pool/handle cannot be torn down by append_segment()/close_index() - // mid-flush. - std::shared_lock latch( - mapping_shards_[mapping_shard_id()].mtx); + // EXCLUSIVE all-shards latch: blocks the lock-free hot path + // (WrappedSegment::write / resize / update_data_crc) which mutates + // meta->data_size / padding_size / data_crc, the very bytes we hash + // to recompute footer.segments_meta_crc and pwrite to disk. Holding + // a single shard's shared lock (the previous design) was insufficient + // because writers on other shards could race with the CRC compute + // and produce a checksum that mismatches the on-disk segment_meta + // bytes, causing IndexError_InvalidChecksum on the next open(). + AllShardsExclusiveLatch latch(mapping_shards_); // NULL GUARD: a previous append_segment() may have left the pool in a // torn-down state. if (!buffer_pool_ || !buffer_pool_handle_) { @@ -826,9 +869,9 @@ class BufferStorage : public IndexStorage { //! Close index storage void close_index(void) { // Flush any outstanding dirty metadata to disk before tearing down. - // IMPORTANT: call flush_index() BEFORE taking the unique_lock below; - // flush_index() internally takes a shared_lock on the same mutex and - // std::shared_mutex is NOT reentrant. + // IMPORTANT: call flush_index() BEFORE taking the all-shards exclusive + // latch below; flush_index() now also takes an all-shards exclusive + // latch and std::shared_mutex is NOT reentrant. this->flush_index(); AllShardsExclusiveLatch latch(mapping_shards_); file_name_.clear(); @@ -894,6 +937,17 @@ class BufferStorage : public IndexStorage { IndexFormat::MetaHeader *header = chain_headers_.back().get(); char *meta_buf = buffer_pool_buffers_.back().get(); + // Rollback handle for the (possibly committed) chain split below. + // Default is a no-op; populated ONLY after Step 1's in-memory commit + // succeeds so that a Step 2 disk-write failure can undo the split as + // well, leaving meta_chains_ / chain_headers_ / buffer_pool_buffers_ / + // footer_ / current_header_start_offset_ exactly as they were before + // append_segment() ran. Without this, a Step 2 failure would leave + // an orphan empty chain permanently appended to the file (harmless + // for correctness because it stays linked and gets reused on next + // append, but disruptive for idempotent retries and unit tests). + std::function rollback_step1 = []() {}; + // ---- Step 1: chain split if current chain has no meta capacity left. if (sizeof(IndexFormat::SegmentMeta) * footer_.segment_count + need_size > chain->segment_ids_offset) { @@ -910,7 +964,7 @@ class BufferStorage : public IndexStorage { // Prepare the linked old footer WITHOUT mutating footer_ yet so // that a write failure leaves in-memory state untouched. - const auto saved_footer = footer_; + const auto saved_footer_before_split = footer_; IndexFormat::MetaFooter linked_footer = footer_; linked_footer.next_meta_header_offset = new_chain_start; IndexFormat::UpdateMetaFooter(&linked_footer, 0); @@ -928,8 +982,8 @@ class BufferStorage : public IndexStorage { // subsequent disk write in this split block fails. auto undo_old_footer = [&]() { buffer_pool_handle_->write_meta( - chain->footer_file_offset, sizeof(saved_footer), - reinterpret_cast(&saved_footer)); + chain->footer_file_offset, sizeof(saved_footer_before_split), + reinterpret_cast(&saved_footer_before_split)); }; // Extend the file and write the new chain's header + (zero) footer. @@ -975,6 +1029,14 @@ class BufferStorage : public IndexStorage { return IndexError_WriteData; } + // Snapshot the OLD chain's pre-commit state for rollback_step1. + // Captured by value because `chain` will be reassigned below to point + // at the new chain's slot in meta_chains_, and pop_back() during + // rollback would invalidate any reference into the old slot. + const auto saved_old_chain_footer = chain->footer; + const uint64_t saved_old_footer_file_offset = chain->footer_file_offset; + const uint64_t saved_current_header_start = current_header_start_offset_; + // All split disk writes succeeded -- commit in-memory state. chain->footer = linked_footer; // old chain keeps linked footer chain_headers_.push_back(std::move(new_header)); @@ -989,6 +1051,42 @@ class BufferStorage : public IndexStorage { chain = &meta_chains_.back(); header = chain_headers_.back().get(); meta_buf = buffer_pool_buffers_.back().get(); + + // Install rollback for the committed split: pop the new chain and + // restore the old chain on both disk and memory. Captured fully by + // value (except `this`-via-member-access) so a subsequent reassignment + // of local pointers (chain/header/meta_buf) does not corrupt the + // closure. + rollback_step1 = [this, saved_footer_before_split, + saved_old_chain_footer, saved_old_footer_file_offset, + saved_current_header_start]() { + // 1. Restore old chain's footer on disk (drop forward link). + buffer_pool_handle_->write_meta( + saved_old_footer_file_offset, sizeof(saved_footer_before_split), + reinterpret_cast(&saved_footer_before_split)); + // 2. Pop the freshly-pushed new chain from in-memory containers. + // The associated unique_ptr / unique_ptr + // are released here. + if (!meta_chains_.empty()) meta_chains_.pop_back(); + if (!chain_headers_.empty()) chain_headers_.pop_back(); + if (!buffer_pool_buffers_.empty()) buffer_pool_buffers_.pop_back(); + // 3. Restore old chain's in-memory footer (its forward link was + // set to the now-popped new chain). + if (!meta_chains_.empty()) { + meta_chains_.back().footer = saved_old_chain_footer; + } + // 4. Restore footer_ and current_header_start_offset_ to their + // pre-split values. The on-disk file size is intentionally NOT + // shrunk: most buffer-pool backends offer no precise truncate, + // and the leftover bytes (the orphan new_header / new_footer + // region) are unreachable -- step 1 above has already removed + // the forward link from the old footer, so ParseToMapping() + // stops at the old chain and the leftover region is reusable + // by the next append_segment()'s split via file_size() + // realignment. + footer_ = saved_footer_before_split; + current_header_start_offset_ = saved_current_header_start; + }; } // ---- Step 2: append SegmentMeta + ID into the (possibly new) last @@ -1057,6 +1155,7 @@ class BufferStorage : public IndexStorage { LOG_ERROR("append_segment: write segment_meta failed, file[%s]", file_name_.c_str()); rollback_step2(); + rollback_step1(); return IndexError_WriteData; } if (buffer_pool_handle_->write_meta( @@ -1065,6 +1164,7 @@ class BufferStorage : public IndexStorage { LOG_ERROR("append_segment: write footer failed, file[%s]", file_name_.c_str()); rollback_step2(); + rollback_step1(); return IndexError_WriteData; } From 21081e6dba3a823266141459a024712e39e1c75f Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Tue, 19 May 2026 21:08:20 +0800 Subject: [PATCH 14/47] fix --- src/core/utility/buffer_storage.cc | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index 528252bca..caca2628b 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -777,6 +777,16 @@ class BufferStorage : public IndexStorage { // and produce a checksum that mismatches the on-disk segment_meta // bytes, causing IndexError_InvalidChecksum on the next open(). AllShardsExclusiveLatch latch(mapping_shards_); + return flush_index_locked(); + } + + //! Internal flush implementation. PRECONDITION: caller MUST already hold + //! AllShardsExclusiveLatch on mapping_shards_. Used by flush_index() + //! (which acquires the latch itself) and by close_index() (which must + //! flush and tear down under a SINGLE continuous latch hold so that no + //! writer can slip in between flush and pool reset and lose its dirty + //! pages). + int flush_index_locked(void) { // NULL GUARD: a previous append_segment() may have left the pool in a // torn-down state. if (!buffer_pool_ || !buffer_pool_handle_) { @@ -868,12 +878,19 @@ class BufferStorage : public IndexStorage { //! Close index storage void close_index(void) { - // Flush any outstanding dirty metadata to disk before tearing down. - // IMPORTANT: call flush_index() BEFORE taking the all-shards exclusive - // latch below; flush_index() now also takes an all-shards exclusive - // latch and std::shared_mutex is NOT reentrant. - this->flush_index(); + // Take the all-shards exclusive latch BEFORE flushing, and hold it for + // the entire teardown sequence. Earlier code released the latch + // between flush and teardown, opening a window in which a writer could + // grab a shared lock, mutate meta_buf via WrappedSegment::write() and + // call set_as_dirty(true). After this close_index() reacquired the + // latch and reset buffer_pool_handle_, those dirty pages would be + // dropped on the floor with no chance to flush. Holding a SINGLE + // latch instance across flush_index_locked() and the reset eliminates + // that window: writers can only enter once we have fully torn down + // (and at that point segments_/buffer_pool_handle_ are gone, so they + // would fail the null/state guards in WrappedSegment). AllShardsExclusiveLatch latch(mapping_shards_); + flush_index_locked(); file_name_.clear(); id_hash_.clear(); segments_.clear(); From 85f89dce2cbd4823b75cba59731e487fe7b3b744 Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Wed, 20 May 2026 16:59:09 +0800 Subject: [PATCH 15/47] fix --- src/core/algorithm/flat/CMakeLists.txt | 7 +++++++ src/core/algorithm/flat_sparse/CMakeLists.txt | 9 +++++++++ src/core/algorithm/hnsw/CMakeLists.txt | 6 ++++++ src/core/algorithm/hnsw_rabitq/CMakeLists.txt | 6 ++++++ src/core/algorithm/hnsw_sparse/CMakeLists.txt | 6 ++++++ src/core/algorithm/ivf/CMakeLists.txt | 6 ++++++ src/core/algorithm/vamana/CMakeLists.txt | 6 ++++++ src/core/metric/CMakeLists.txt | 6 ++++++ src/core/mixed_reducer/CMakeLists.txt | 6 ++++++ src/core/quantizer/CMakeLists.txt | 6 ++++++ src/core/utility/CMakeLists.txt | 6 ++++++ 11 files changed, 70 insertions(+) diff --git a/src/core/algorithm/flat/CMakeLists.txt b/src/core/algorithm/flat/CMakeLists.txt index 4564d8ef0..60814960e 100644 --- a/src/core/algorithm/flat/CMakeLists.txt +++ b/src/core/algorithm/flat/CMakeLists.txt @@ -1,11 +1,18 @@ include(${PROJECT_ROOT_DIR}/cmake/bazel.cmake) include(${PROJECT_ROOT_DIR}/cmake/option.cmake) #message(STATUS "PROJECT_ROOT_DIR = ${PROJECT_ROOT_DIR}") + +if(NOT APPLE) + set(CORE_KNN_FLAT_LDFLAGS + "-Wl,--exclude-libs,libparquet.a:libarrow.a:libarrow_bundled_dependencies.a") +endif() + cc_library( NAME core_knn_flat STATIC SHARED STRICT ALWAYS_LINK SRCS *.cc LIBS core_framework INCS . ${PROJECT_ROOT_DIR}/src/core ${PROJECT_ROOT_DIR}/src/core/algorithm ${PROJECT_ROOT_DIR}/src/core/framework + LDFLAGS "${CORE_KNN_FLAT_LDFLAGS}" VERSION "${PROXIMA_ZVEC_VERSION}" ) diff --git a/src/core/algorithm/flat_sparse/CMakeLists.txt b/src/core/algorithm/flat_sparse/CMakeLists.txt index e27d2d3ee..44766138d 100644 --- a/src/core/algorithm/flat_sparse/CMakeLists.txt +++ b/src/core/algorithm/flat_sparse/CMakeLists.txt @@ -1,11 +1,20 @@ include(${PROJECT_ROOT_DIR}/cmake/bazel.cmake) include(${PROJECT_ROOT_DIR}/cmake/option.cmake) +# --exclude-libs is GNU ld / LLVM lld only; Apple ld does not support it. +# On macOS (Mach-O), symbol interposition works differently and the +# Arrow/Parquet double-free issue does not apply. +if(NOT APPLE) + set(CORE_KNN_FLAT_SPARSE_LDFLAGS + "-Wl,--exclude-libs,libparquet.a:libarrow.a:libarrow_bundled_dependencies.a") +endif() + cc_library( NAME core_knn_flat_sparse STATIC SHARED STRICT ALWAYS_LINK SRCS *.cc LIBS core_framework INCS . ${PROJECT_ROOT_DIR}/src/core ${PROJECT_ROOT_DIR}/src/core/algorithm + LDFLAGS "${CORE_KNN_FLAT_SPARSE_LDFLAGS}" VERSION "${PROXIMA_ZVEC_VERSION}" ) diff --git a/src/core/algorithm/hnsw/CMakeLists.txt b/src/core/algorithm/hnsw/CMakeLists.txt index f4a105402..cfd1147f4 100644 --- a/src/core/algorithm/hnsw/CMakeLists.txt +++ b/src/core/algorithm/hnsw/CMakeLists.txt @@ -1,11 +1,17 @@ include(${PROJECT_ROOT_DIR}/cmake/bazel.cmake) include(${PROJECT_ROOT_DIR}/cmake/option.cmake) +if(NOT APPLE) + set(CORE_KNN_HNSW_LDFLAGS + "-Wl,--exclude-libs,libparquet.a:libarrow.a:libarrow_bundled_dependencies.a") +endif() + cc_library( NAME core_knn_hnsw STATIC SHARED STRICT ALWAYS_LINK SRCS *.cc LIBS core_framework sparsehash INCS . ${PROJECT_ROOT_DIR}/src/core ${PROJECT_ROOT_DIR}/src/core/algorithm + LDFLAGS "${CORE_KNN_HNSW_LDFLAGS}" VERSION "${PROXIMA_ZVEC_VERSION}" ) diff --git a/src/core/algorithm/hnsw_rabitq/CMakeLists.txt b/src/core/algorithm/hnsw_rabitq/CMakeLists.txt index ed547dc76..09ce72f55 100644 --- a/src/core/algorithm/hnsw_rabitq/CMakeLists.txt +++ b/src/core/algorithm/hnsw_rabitq/CMakeLists.txt @@ -11,11 +11,17 @@ if(AUTO_DETECT_ARCH) endforeach() endif() +if(NOT APPLE) + set(CORE_KNN_HNSW_RABITQ_LDFLAGS + "-Wl,--exclude-libs,libparquet.a:libarrow.a:libarrow_bundled_dependencies.a") +endif() + cc_library( NAME core_knn_hnsw_rabitq STATIC SHARED STRICT ALWAYS_LINK SRCS *.cc LIBS core_framework rabitqlib sparsehash INCS . ${PROJECT_ROOT_DIR}/src ${PROJECT_ROOT_DIR}/src/core ${PROJECT_ROOT_DIR}/src/core/algorithm + LDFLAGS "${CORE_KNN_HNSW_RABITQ_LDFLAGS}" VERSION "${PROXIMA_ZVEC_VERSION}" ) \ No newline at end of file diff --git a/src/core/algorithm/hnsw_sparse/CMakeLists.txt b/src/core/algorithm/hnsw_sparse/CMakeLists.txt index fe26d10e1..15295b485 100644 --- a/src/core/algorithm/hnsw_sparse/CMakeLists.txt +++ b/src/core/algorithm/hnsw_sparse/CMakeLists.txt @@ -1,11 +1,17 @@ include(${PROJECT_ROOT_DIR}/cmake/bazel.cmake) include(${PROJECT_ROOT_DIR}/cmake/option.cmake) +if(NOT APPLE) + set(CORE_KNN_HNSW_SPARSE_LDFLAGS + "-Wl,--exclude-libs,libparquet.a:libarrow.a:libarrow_bundled_dependencies.a") +endif() + cc_library( NAME core_knn_hnsw_sparse STATIC SHARED STRICT ALWAYS_LINK SRCS *.cc LIBS core_framework sparsehash INCS . ${PROJECT_ROOT_DIR}/src/core ${PROJECT_ROOT_DIR}/src/core/algorithm + LDFLAGS "${CORE_KNN_HNSW_SPARSE_LDFLAGS}" VERSION "${PROXIMA_ZVEC_VERSION}" ) diff --git a/src/core/algorithm/ivf/CMakeLists.txt b/src/core/algorithm/ivf/CMakeLists.txt index ffcf30949..8e3872f31 100644 --- a/src/core/algorithm/ivf/CMakeLists.txt +++ b/src/core/algorithm/ivf/CMakeLists.txt @@ -1,10 +1,16 @@ include(${PROJECT_ROOT_DIR}/cmake/bazel.cmake) include(${PROJECT_ROOT_DIR}/cmake/option.cmake) +if(NOT APPLE) + set(CORE_KNN_IVF_LDFLAGS + "-Wl,--exclude-libs,libparquet.a:libarrow.a:libarrow_bundled_dependencies.a") +endif() + cc_library( NAME core_knn_ivf STATIC SHARED STRICT ALWAYS_LINK SRCS *.cc LIBS zvec_ailego core_framework core_knn_cluster INCS . ${PROJECT_ROOT_DIR}/src/core ${PROJECT_ROOT_DIR}/src/core/algorithm + LDFLAGS "${CORE_KNN_IVF_LDFLAGS}" VERSION "${PROXIMA_ZVEC_VERSION}" ) diff --git a/src/core/algorithm/vamana/CMakeLists.txt b/src/core/algorithm/vamana/CMakeLists.txt index 8e5bbda1e..b2feaf9c1 100644 --- a/src/core/algorithm/vamana/CMakeLists.txt +++ b/src/core/algorithm/vamana/CMakeLists.txt @@ -1,11 +1,17 @@ include(${PROJECT_ROOT_DIR}/cmake/bazel.cmake) include(${PROJECT_ROOT_DIR}/cmake/option.cmake) +if(NOT APPLE) + set(CORE_KNN_VAMANA_LDFLAGS + "-Wl,--exclude-libs,libparquet.a:libarrow.a:libarrow_bundled_dependencies.a") +endif() + cc_library( NAME core_knn_vamana STATIC SHARED STRICT ALWAYS_LINK SRCS *.cc LIBS core_framework core_knn_hnsw sparsehash INCS . ${PROJECT_ROOT_DIR}/src/core ${PROJECT_ROOT_DIR}/src/core/algorithm + LDFLAGS "${CORE_KNN_VAMANA_LDFLAGS}" VERSION "${PROXIMA_ZVEC_VERSION}" ) diff --git a/src/core/metric/CMakeLists.txt b/src/core/metric/CMakeLists.txt index 55dfc901e..2918b909b 100644 --- a/src/core/metric/CMakeLists.txt +++ b/src/core/metric/CMakeLists.txt @@ -1,11 +1,17 @@ include(${PROJECT_ROOT_DIR}/cmake/bazel.cmake) include(${PROJECT_ROOT_DIR}/cmake/option.cmake) +if(NOT APPLE) + set(CORE_METRIC_LDFLAGS + "-Wl,--exclude-libs,libparquet.a:libarrow.a:libarrow_bundled_dependencies.a") +endif() + cc_library( NAME core_metric STATIC SHARED STRICT ALWAYS_LINK SRCS *.cc LIBS zvec_ailego zvec_turbo core_framework INCS . ${PROJECT_ROOT_DIR}/src/core + LDFLAGS "${CORE_METRIC_LDFLAGS}" VERSION "${PROXIMA_ZVEC_VERSION}" ) diff --git a/src/core/mixed_reducer/CMakeLists.txt b/src/core/mixed_reducer/CMakeLists.txt index e9566456e..e7204f0f7 100644 --- a/src/core/mixed_reducer/CMakeLists.txt +++ b/src/core/mixed_reducer/CMakeLists.txt @@ -1,10 +1,16 @@ include(${PROJECT_ROOT_DIR}/cmake/bazel.cmake) include(${PROJECT_ROOT_DIR}/cmake/option.cmake) +if(NOT APPLE) + set(CORE_MIX_REDUCER_LDFLAGS + "-Wl,--exclude-libs,libparquet.a:libarrow.a:libarrow_bundled_dependencies.a") +endif() + cc_library( NAME core_mix_reducer STATIC SHARED STRICT ALWAYS_LINK SRCS *.cc LIBS zvec_ailego core_framework INCS . ${PROJECT_ROOT_DIR}/src/core + LDFLAGS "${CORE_MIX_REDUCER_LDFLAGS}" VERSION "${PROXIMA_ZVEC_VERSION}" ) diff --git a/src/core/quantizer/CMakeLists.txt b/src/core/quantizer/CMakeLists.txt index 21a03e449..80b4f612a 100644 --- a/src/core/quantizer/CMakeLists.txt +++ b/src/core/quantizer/CMakeLists.txt @@ -1,11 +1,17 @@ include(${PROJECT_ROOT_DIR}/cmake/bazel.cmake) include(${PROJECT_ROOT_DIR}/cmake/option.cmake) +if(NOT APPLE) + set(CORE_QUANTIZER_LDFLAGS + "-Wl,--exclude-libs,libparquet.a:libarrow.a:libarrow_bundled_dependencies.a") +endif() + cc_library( NAME core_quantizer STATIC SHARED STRICT ALWAYS_LINK SRCS *.cc LIBS zvec_ailego core_framework INCS . ${PROJECT_ROOT_DIR}/src/core + LDFLAGS "${CORE_QUANTIZER_LDFLAGS}" VERSION "${PROXIMA_ZVEC_VERSION}" ) diff --git a/src/core/utility/CMakeLists.txt b/src/core/utility/CMakeLists.txt index 99cf87ca2..7c3adf702 100644 --- a/src/core/utility/CMakeLists.txt +++ b/src/core/utility/CMakeLists.txt @@ -1,11 +1,17 @@ include(${PROJECT_ROOT_DIR}/cmake/bazel.cmake) include(${PROJECT_ROOT_DIR}/cmake/option.cmake) +if(NOT APPLE) + set(CORE_UTILITY_LDFLAGS + "-Wl,--exclude-libs,libparquet.a:libarrow.a:libarrow_bundled_dependencies.a") +endif() + cc_library( NAME core_utility STATIC SHARED STRICT ALWAYS_LINK SRCS *.cc LIBS zvec_ailego core_framework INCS . ${PROJECT_ROOT_DIR}/src/core + LDFLAGS "${CORE_UTILITY_LDFLAGS}" VERSION "${PROXIMA_ZVEC_VERSION}" ) From 5f8a745fef8fba4125329046366fe4e1fe028d3e Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Thu, 21 May 2026 21:25:35 +0800 Subject: [PATCH 16/47] fix --- src/ailego/buffer/vector_page_table.cc | 8 +++++++- src/core/algorithm/hnsw/hnsw_index_hash.h | 2 -- src/core/utility/buffer_storage.cc | 15 ++++++++++++--- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/src/ailego/buffer/vector_page_table.cc b/src/ailego/buffer/vector_page_table.cc index c96e40b91..d7653ea9f 100644 --- a/src/ailego/buffer/vector_page_table.cc +++ b/src/ailego/buffer/vector_page_table.cc @@ -134,16 +134,22 @@ void VectorPageTable::release_block(block_id_t block_id) { void VectorPageTable::evict_block(block_id_t block_id) { assert(block_id < entry_num_); Entry &e = entry_at(block_id); - char *buffer = e.buffer; int expected = 0; if (e.ref_count.compare_exchange_strong( expected, std::numeric_limits::min())) { + // Read e.buffer ONLY after we won the CAS, so we are guaranteed to be the + // sole owner of the slot. Reading it before the CAS races with another + // thread that may have already evicted (and freed) e.buffer and then had + // a fresh acquire_buffer / set_block_acquired sequence overwrite e.buffer + // with a new pointer. + char *buffer = e.buffer; if (buffer && e.is_dirty.load(std::memory_order_relaxed) && flush_callback_) { flush_callback_(block_id, buffer, kVectorPageSize, e.file_offset); e.is_dirty.store(false, std::memory_order_relaxed); } if (buffer) { + e.buffer = nullptr; MemoryLimitPool::get_instance().release_buffer(buffer, kVectorPageSize); } } diff --git a/src/core/algorithm/hnsw/hnsw_index_hash.h b/src/core/algorithm/hnsw/hnsw_index_hash.h index 29d81ac92..cc59e84ab 100644 --- a/src/core/algorithm/hnsw/hnsw_index_hash.h +++ b/src/core/algorithm/hnsw/hnsw_index_hash.h @@ -141,7 +141,6 @@ class HnswIndexHashMap { auto idx = key >> mask_bits_; if (idx >= slots_.size()) { if (ailego_unlikely(idx >= slots_.capacity())) { - LOG_ERROR("no space to insert"); return false; } for (auto i = slots_.size(); i <= idx; ++i) { @@ -152,7 +151,6 @@ class HnswIndexHashMap { } auto it = slots_[idx].find(key, slot_items_, slot_loc_mask_); if (ailego_unlikely(it == nullptr)) { - LOG_ERROR("no space to insert"); return false; } diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index caca2628b..afa6f2f69 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -46,8 +46,11 @@ struct CrossPageScratch { char *ensure(size_t len) { if (cap < len) { if (buf) ailego_free(buf); - buf = static_cast(ailego_aligned_malloc(len, 4096)); - cap = buf ? len : 0; + // C11 aligned_alloc requires size to be a multiple of alignment. + const size_t kAlign = 4096UL; + size_t alloc_size = (len + (kAlign - 1UL)) & ~(kAlign - 1UL); + buf = static_cast(ailego_aligned_malloc(alloc_size, kAlign)); + cap = buf ? alloc_size : 0; } return buf; } @@ -220,7 +223,13 @@ class BufferStorage : public IndexStorage { data.reset(owner_->buffer_pool_handle_.get(), page_id, raw); return len; } - char *tmp = static_cast(ailego_aligned_malloc(len, 4096)); + // C11 aligned_alloc requires the requested size to be a multiple of + // the alignment; round len up to the next 4K boundary. Without this + // glibc treats the call as undefined behaviour and silently corrupts + // heap metadata (manifesting later as `corrupted size vs. prev_size`). + const size_t kAlign = 4096UL; + size_t alloc_size = (len + (kAlign - 1UL)) & ~(kAlign - 1UL); + char *tmp = static_cast(ailego_aligned_malloc(alloc_size, kAlign)); if (!tmp) { LOG_ERROR("read error (alloc cross-page temp buffer failed)."); return 0; From 4940ef0cc656a90616544a92e10bcefa84bc07ea Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Fri, 22 May 2026 13:30:29 +0800 Subject: [PATCH 17/47] fix --- src/ailego/buffer/vector_page_table.cc | 29 ++++++++++++++++++-------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/src/ailego/buffer/vector_page_table.cc b/src/ailego/buffer/vector_page_table.cc index d7653ea9f..2d222ffe1 100644 --- a/src/ailego/buffer/vector_page_table.cc +++ b/src/ailego/buffer/vector_page_table.cc @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -135,13 +136,13 @@ void VectorPageTable::evict_block(block_id_t block_id) { assert(block_id < entry_num_); Entry &e = entry_at(block_id); int expected = 0; - if (e.ref_count.compare_exchange_strong( - expected, std::numeric_limits::min())) { - // Read e.buffer ONLY after we won the CAS, so we are guaranteed to be the - // sole owner of the slot. Reading it before the CAS races with another - // thread that may have already evicted (and freed) e.buffer and then had - // a fresh acquire_buffer / set_block_acquired sequence overwrite e.buffer - // with a new pointer. + // Two-phase eviction to prevent data race on e.buffer with + // set_block_acquired. We first CAS to kEvicting (-1), which causes + // set_block_acquired to spin-wait; then do the actual work (flush, free, + // null buffer); finally store INT_MIN ("evicted") which unblocks + // set_block_acquired. + static constexpr int kEvicting = -1; + if (e.ref_count.compare_exchange_strong(expected, kEvicting)) { char *buffer = e.buffer; if (buffer && e.is_dirty.load(std::memory_order_relaxed) && flush_callback_) { @@ -152,6 +153,10 @@ void VectorPageTable::evict_block(block_id_t block_id) { e.buffer = nullptr; MemoryLimitPool::get_instance().release_buffer(buffer, kVectorPageSize); } + // Transition to fully-evicted state. Use release so that the + // set_block_acquired acquire-load sees e.buffer == nullptr. + e.ref_count.store(std::numeric_limits::min(), + std::memory_order_release); } e.in_evict_queue.store(false, std::memory_order_relaxed); } @@ -161,7 +166,7 @@ char *VectorPageTable::set_block_acquired(block_id_t block_id, char *buffer, assert(block_id < entry_num_); Entry &e = entry_at(block_id); while (true) { - int current_count = e.ref_count.load(std::memory_order_relaxed); + int current_count = e.ref_count.load(std::memory_order_acquire); if (current_count >= 0) { if (e.ref_count.compare_exchange_weak( current_count, current_count + 1, std::memory_order_acq_rel, @@ -169,13 +174,19 @@ char *VectorPageTable::set_block_acquired(block_id_t block_id, char *buffer, MemoryLimitPool::get_instance().release_buffer(buffer, kVectorPageSize); return e.buffer; } - } else { + } else if (current_count == std::numeric_limits::min()) { + // Fully evicted — safe to claim this entry for our new buffer. e.buffer = buffer; e.file_offset = file_offset; e.in_evict_queue.store(false, std::memory_order_relaxed); e.is_dirty.store(false, std::memory_order_relaxed); e.ref_count.store(1, std::memory_order_release); return e.buffer; + } else { + // kEvicting (-1): eviction is in progress on this entry. Spin briefly + // until evict_block finishes (transitions to INT_MIN). + // This is a very short critical section (flush + free, ~μs). + std::this_thread::yield(); } } } From f545524553a21e7a39059bcea7f1369db0926fa6 Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Fri, 22 May 2026 14:56:41 +0800 Subject: [PATCH 18/47] fix --- src/core/algorithm/hnsw/hnsw_streamer_entity.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/core/algorithm/hnsw/hnsw_streamer_entity.h b/src/core/algorithm/hnsw/hnsw_streamer_entity.h index 3c2fb0cea..af3de1990 100644 --- a/src/core/algorithm/hnsw/hnsw_streamer_entity.h +++ b/src/core/algorithm/hnsw/hnsw_streamer_entity.h @@ -370,6 +370,11 @@ class HnswStreamerEntity : public HnswEntity { if (level == 0) { return 0; } + // Serialize concurrent add_upper_neighbor calls: multiple build threads + // share the same entity via shared_mutex (shared-lock), so both + // upper_neighbor_chunks_ (vector mutation) and upper_neighbor_index_->insert + // (hashmap slot assignment) must be protected from concurrent writes. + std::lock_guard lk(upper_neighbor_mutex_); Chunk::Pointer chunk; uint64_t chunk_offset = UINT64_MAX; size_t neighbors_size = get_total_upper_neighbors_size(level); @@ -529,6 +534,9 @@ class HnswStreamerEntity : public HnswEntity { protected: IndexStreamer::Stats &stats_; std::mutex mutex_{}; + //! Guards add_upper_neighbor (upper_neighbor_chunks_ + upper_neighbor_index_ + //! insert) against concurrent build threads holding the shared lock. + mutable std::mutex upper_neighbor_mutex_{}; size_t max_index_size_{0UL}; uint32_t chunk_size_{kDefaultChunkSize}; uint32_t upper_neighbor_chunk_size_{kDefaultChunkSize}; From 23be06eacd91c40f6472960b1545e7d844be1e45 Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Fri, 22 May 2026 15:45:53 +0800 Subject: [PATCH 19/47] clang format --- src/ailego/buffer/vector_page_table.cc | 56 +++++++++---------- .../algorithm/hnsw/hnsw_streamer_entity.h | 5 +- src/core/utility/buffer_storage.cc | 50 ++++++++--------- .../zvec/ailego/buffer/vector_page_table.h | 9 ++- .../zvec/core/framework/index_storage.h | 1 - 5 files changed, 57 insertions(+), 64 deletions(-) diff --git a/src/ailego/buffer/vector_page_table.cc b/src/ailego/buffer/vector_page_table.cc index 2d222ffe1..78dcd3c69 100644 --- a/src/ailego/buffer/vector_page_table.cc +++ b/src/ailego/buffer/vector_page_table.cc @@ -120,9 +120,9 @@ void VectorPageTable::release_block(block_id_t block_id) { if (e.ref_count.fetch_sub(1, std::memory_order_release) == 1) { std::atomic_thread_fence(std::memory_order_acquire); bool expected = false; - if (e.in_evict_queue.compare_exchange_strong( - expected, true, std::memory_order_acq_rel, - std::memory_order_relaxed)) { + if (e.in_evict_queue.compare_exchange_strong(expected, true, + std::memory_order_acq_rel, + std::memory_order_relaxed)) { BlockEvictionQueue::BlockType block; block.page_table = this; block.vector_block.first = block_id; @@ -168,9 +168,9 @@ char *VectorPageTable::set_block_acquired(block_id_t block_id, char *buffer, while (true) { int current_count = e.ref_count.load(std::memory_order_acquire); if (current_count >= 0) { - if (e.ref_count.compare_exchange_weak( - current_count, current_count + 1, std::memory_order_acq_rel, - std::memory_order_acquire)) { + if (e.ref_count.compare_exchange_weak(current_count, current_count + 1, + std::memory_order_acq_rel, + std::memory_order_acquire)) { MemoryLimitPool::get_instance().release_buffer(buffer, kVectorPageSize); return e.buffer; } @@ -196,16 +196,13 @@ VecBufferPool::VecBufferPool(const std::string &filename, bool writable, file_name_ = filename; writable_ = writable || create; #if defined(_MSC_VER) - int flags = - writable_ - ? (create ? (O_RDWR | O_CREAT | O_TRUNC | _O_BINARY) - : (O_RDWR | _O_BINARY)) - : (O_RDONLY | _O_BINARY); + int flags = writable_ ? (create ? (O_RDWR | O_CREAT | O_TRUNC | _O_BINARY) + : (O_RDWR | _O_BINARY)) + : (O_RDONLY | _O_BINARY); fd_ = _open(filename.c_str(), flags, 0644); #else - int flags = writable_ - ? (create ? (O_RDWR | O_CREAT | O_TRUNC) : O_RDWR) - : O_RDONLY; + int flags = + writable_ ? (create ? (O_RDWR | O_CREAT | O_TRUNC) : O_RDWR) : O_RDONLY; fd_ = ::open(filename.c_str(), flags, 0644); #endif if (fd_ < 0) { @@ -239,23 +236,23 @@ int VecBufferPool::init() { if (writable_) { int fd = fd_; const std::string &name = file_name_; - page_table_.set_flush_callback( - [fd, &name](block_id_t /*block_id*/, char *buf, size_t sz, - size_t off) -> int { + page_table_.set_flush_callback([fd, &name](block_id_t /*block_id*/, + char *buf, size_t sz, + size_t off) -> int { #if defined(_MSC_VER) - ssize_t w = zvec_pwrite(fd, buf, sz, off); + ssize_t w = zvec_pwrite(fd, buf, sz, off); #else - ssize_t w = ::pwrite(fd, buf, sz, off); + ssize_t w = ::pwrite(fd, buf, sz, off); #endif - if (w != static_cast(sz)) { - LOG_ERROR( - "Buffer pool flush failed: file[%s], offset[%zu], " - "expected[%zu], got[%zd]", - name.c_str(), off, sz, w); - return -1; - } - return 0; - }); + if (w != static_cast(sz)) { + LOG_ERROR( + "Buffer pool flush failed: file[%s], offset[%zu], " + "expected[%zu], got[%zd]", + name.c_str(), off, sz, w); + return -1; + } + return 0; + }); } return 0; } @@ -358,8 +355,7 @@ int VecBufferPool::write_range(size_t file_offset, size_t length, return -1; } size_t page_start = pg * kVectorPageSize; - size_t intra_offset = - (pg == first_page) ? (file_offset - page_start) : 0; + size_t intra_offset = (pg == first_page) ? (file_offset - page_start) : 0; size_t chunk = std::min(kVectorPageSize - intra_offset, remaining); std::memcpy(page + intra_offset, src + src_cursor, chunk); page_table_.mark_dirty(pg); diff --git a/src/core/algorithm/hnsw/hnsw_streamer_entity.h b/src/core/algorithm/hnsw/hnsw_streamer_entity.h index af3de1990..59f0285a9 100644 --- a/src/core/algorithm/hnsw/hnsw_streamer_entity.h +++ b/src/core/algorithm/hnsw/hnsw_streamer_entity.h @@ -372,8 +372,9 @@ class HnswStreamerEntity : public HnswEntity { } // Serialize concurrent add_upper_neighbor calls: multiple build threads // share the same entity via shared_mutex (shared-lock), so both - // upper_neighbor_chunks_ (vector mutation) and upper_neighbor_index_->insert - // (hashmap slot assignment) must be protected from concurrent writes. + // upper_neighbor_chunks_ (vector mutation) and + // upper_neighbor_index_->insert (hashmap slot assignment) must be protected + // from concurrent writes. std::lock_guard lk(upper_neighbor_mutex_); Chunk::Pointer chunk; uint64_t chunk_offset = UINT64_MAX; diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index 637d1e179..8606c562c 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include #include #include #include #include -#include #include #include #include @@ -229,7 +229,8 @@ class BufferStorage : public IndexStorage { // heap metadata (manifesting later as `corrupted size vs. prev_size`). const size_t kAlign = 4096UL; size_t alloc_size = (len + (kAlign - 1UL)) & ~(kAlign - 1UL); - char *tmp = static_cast(ailego_aligned_malloc(alloc_size, kAlign)); + char *tmp = + static_cast(ailego_aligned_malloc(alloc_size, kAlign)); if (!tmp) { LOG_ERROR("read error (alloc cross-page temp buffer failed)."); return 0; @@ -268,8 +269,9 @@ class BufferStorage : public IndexStorage { return len; } if (ailego_unlikely(offset > capacity_ || len > capacity_ - offset)) { - LOG_ERROR("write() exceeds segment capacity: offset=%zu len=%zu cap=%zu", - offset, len, capacity_); + LOG_ERROR( + "write() exceeds segment capacity: offset=%zu len=%zu cap=%zu", + offset, len, capacity_); return 0; } auto meta = segment_info_->segment.meta(); @@ -521,9 +523,9 @@ class BufferStorage : public IndexStorage { // are multiple meta-header chains in the file, the next ParseHeader() // would overwrite that single instance and break content_offset for // all earlier-chain segments. - segments_[seg_name] = IndexMapping::SegmentInfo{ - IndexMapping::Segment{iter}, current_header_start_offset_, - chain_header}; + segments_[seg_name] = + IndexMapping::SegmentInfo{IndexMapping::Segment{iter}, + current_header_start_offset_, chain_header}; max_segment_size_ = std::max(max_segment_size_, iter->data_size + iter->padding_size); if (sizeof(IndexFormat::SegmentMeta) * footer_.segment_count > @@ -545,8 +547,7 @@ class BufferStorage : public IndexStorage { // never overwrite earlier-chain headers (prior implementation used a // single header_ member, which corrupted content_offset for chain-0 // segments once chain-1 was parsed). - chain_headers_.emplace_back( - std::make_unique()); + chain_headers_.emplace_back(std::make_unique()); IndexFormat::MetaHeader *chain_header = chain_headers_.back().get(); ret = ParseHeader(current_header_start_offset_, chain_header); if (ret != 0) { @@ -587,8 +588,8 @@ class BufferStorage : public IndexStorage { const uint64_t segment_start_offset = footer_offset - footer_.segments_meta_size; uint32_t segment_ids_offset = footer_.segments_meta_size; - ret = ParseSegment(segment_start_offset, chain_header, - &segment_ids_offset); + ret = + ParseSegment(segment_start_offset, chain_header, &segment_ids_offset); if (ret != 0) { LOG_ERROR("Failed to parse segment, errno %d, %s", ret, IndexError::What(ret)); @@ -598,8 +599,7 @@ class BufferStorage : public IndexStorage { // Record per-chain metadata offsets so flush_index() can write // updated segment metas and footers back to the backing file. meta_chains_.push_back({current_header_start_offset_, footer_offset, - segment_start_offset, - footer_.segments_meta_size, + segment_start_offset, footer_.segments_meta_size, segment_ids_offset, footer_}); if (footer_.next_meta_header_offset == 0) { @@ -927,8 +927,7 @@ class BufferStorage : public IndexStorage { AllShardsExclusiveLatch latch(mapping_shards_); if (!buffer_pool_ || !buffer_pool_handle_) { - LOG_ERROR("append_segment: pool not ready, file[%s]", - file_name_.c_str()); + LOG_ERROR("append_segment: pool not ready, file[%s]", file_name_.c_str()); return IndexError_Runtime; } if (!buffer_pool_->writable()) { @@ -944,8 +943,7 @@ class BufferStorage : public IndexStorage { } if (meta_chains_.empty() || chain_headers_.empty() || buffer_pool_buffers_.empty()) { - LOG_ERROR("append_segment: invalid state, file[%s]", - file_name_.c_str()); + LOG_ERROR("append_segment: invalid state, file[%s]", file_name_.c_str()); return IndexError_Runtime; } @@ -1034,8 +1032,8 @@ class BufferStorage : public IndexStorage { IndexFormat::SetupMetaFooter(&new_footer); new_footer.segments_meta_size = new_segments_meta_size; new_footer.total_size = new_meta_total; - new_footer.segments_meta_crc = ailego::Crc32c::Hash( - new_meta_buf.get(), new_segments_meta_size, 0u); + new_footer.segments_meta_crc = + ailego::Crc32c::Hash(new_meta_buf.get(), new_segments_meta_size, 0u); IndexFormat::UpdateMetaFooter(&new_footer, 0); if (buffer_pool_handle_->write_meta( @@ -1067,10 +1065,9 @@ class BufferStorage : public IndexStorage { chain->footer = linked_footer; // old chain keeps linked footer chain_headers_.push_back(std::move(new_header)); buffer_pool_buffers_.push_back(std::move(new_meta_buf)); - meta_chains_.push_back(MetaChain{new_chain_start, new_footer_file_offset, - new_segment_meta_file_offset, - new_segments_meta_size, - new_segments_meta_size, new_footer}); + meta_chains_.push_back(MetaChain{ + new_chain_start, new_footer_file_offset, new_segment_meta_file_offset, + new_segments_meta_size, new_segments_meta_size, new_footer}); footer_ = new_footer; current_header_start_offset_ = new_chain_start; @@ -1083,8 +1080,8 @@ class BufferStorage : public IndexStorage { // value (except `this`-via-member-access) so a subsequent reassignment // of local pointers (chain/header/meta_buf) does not corrupt the // closure. - rollback_step1 = [this, saved_footer_before_split, - saved_old_chain_footer, saved_old_footer_file_offset, + rollback_step1 = [this, saved_footer_before_split, saved_old_chain_footer, + saved_old_footer_file_offset, saved_current_header_start]() { // 1. Restore old chain's footer on disk (drop forward link). buffer_pool_handle_->write_meta( @@ -1250,7 +1247,8 @@ class BufferStorage : public IndexStorage { for (size_t i = 0; i < kMappingMutexShards; ++i) shards_[i].mtx.unlock(); } AllShardsExclusiveLatch(const AllShardsExclusiveLatch &) = delete; - AllShardsExclusiveLatch &operator=(const AllShardsExclusiveLatch &) = delete; + AllShardsExclusiveLatch &operator=(const AllShardsExclusiveLatch &) = + delete; }; // buffer manager diff --git a/src/include/zvec/ailego/buffer/vector_page_table.h b/src/include/zvec/ailego/buffer/vector_page_table.h index 5996a9b2c..24c70838d 100644 --- a/src/include/zvec/ailego/buffer/vector_page_table.h +++ b/src/include/zvec/ailego/buffer/vector_page_table.h @@ -57,8 +57,7 @@ class VectorPageTable { public: // Callback invoked by evict_block() to persist a dirty block before its // memory is released. Signature: (block_id, buffer, size, file_offset). - using FlushCallback = - std::function; + using FlushCallback = std::function; VectorPageTable() { BlockEvictionQueue::get_instance().set_valid(this); @@ -118,8 +117,7 @@ class VectorPageTable { if (!e.is_dirty.load(std::memory_order_relaxed)) { return 0; } - int rc = flush_callback_(block_id, buffer, kVectorPageSize, - e.file_offset); + int rc = flush_callback_(block_id, buffer, kVectorPageSize, e.file_offset); if (rc == 0) { e.is_dirty.store(false, std::memory_order_relaxed); } @@ -146,7 +144,8 @@ class VectorPageTable { static constexpr size_t kSegmentShift = 16; // 65536 entries per segment static constexpr size_t kSegmentSize = size_t{1} << kSegmentShift; static constexpr size_t kSegmentMask = kSegmentSize - 1; - static constexpr size_t kMaxSegments = 2048; // up to 128M entries (512GB @ 4K) + static constexpr size_t kMaxSegments = + 2048; // up to 128M entries (512GB @ 4K) size_t entry_num_{0}; size_t segment_count_{0}; diff --git a/src/include/zvec/core/framework/index_storage.h b/src/include/zvec/core/framework/index_storage.h index 1fae20eb9..3da2e6669 100644 --- a/src/include/zvec/core/framework/index_storage.h +++ b/src/include/zvec/core/framework/index_storage.h @@ -15,7 +15,6 @@ #pragma once #include - #include #include #include From 73d50102cf356f5d59f569e4adfef850a3f204aa Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Mon, 25 May 2026 10:59:10 +0800 Subject: [PATCH 20/47] fix --- src/core/algorithm/flat/flat_streamer.cc | 2 +- src/core/algorithm/hnsw/hnsw_streamer.cc | 2 +- src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer.cc | 2 +- src/core/algorithm/hnsw_sparse/hnsw_sparse_streamer.cc | 2 +- src/core/algorithm/vamana/vamana_streamer.cc | 2 +- src/db/index/segment/segment.cc | 10 ++++++++++ 6 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/core/algorithm/flat/flat_streamer.cc b/src/core/algorithm/flat/flat_streamer.cc index 8969efc14..5e6171659 100644 --- a/src/core/algorithm/flat/flat_streamer.cc +++ b/src/core/algorithm/flat/flat_streamer.cc @@ -34,7 +34,7 @@ FlatStreamer::FlatStreamer() : entity_(stats_) {} template FlatStreamer::~FlatStreamer() { - if (state_ == STATE_INITED) { + if (state_ == STATE_INITED || state_ == STATE_OPENED) { this->cleanup(); } } diff --git a/src/core/algorithm/hnsw/hnsw_streamer.cc b/src/core/algorithm/hnsw/hnsw_streamer.cc index 935cae5d4..c5e78f415 100644 --- a/src/core/algorithm/hnsw/hnsw_streamer.cc +++ b/src/core/algorithm/hnsw/hnsw_streamer.cc @@ -28,7 +28,7 @@ namespace core { HnswStreamer::HnswStreamer() = default; HnswStreamer::~HnswStreamer() { - if (state_ == STATE_INITED) { + if (state_ == STATE_INITED || state_ == STATE_OPENED) { this->cleanup(); } } diff --git a/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer.cc b/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer.cc index 9eacf0bc6..2ea2f6aa0 100644 --- a/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer.cc +++ b/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer.cc @@ -40,7 +40,7 @@ HnswRabitqStreamer::HnswRabitqStreamer(IndexProvider::Pointer provider, provider_(std::move(provider)) {} HnswRabitqStreamer::~HnswRabitqStreamer() { - if (state_ == STATE_INITED) { + if (state_ == STATE_INITED || state_ == STATE_OPENED) { this->cleanup(); } } diff --git a/src/core/algorithm/hnsw_sparse/hnsw_sparse_streamer.cc b/src/core/algorithm/hnsw_sparse/hnsw_sparse_streamer.cc index 3abce8087..20c215257 100644 --- a/src/core/algorithm/hnsw_sparse/hnsw_sparse_streamer.cc +++ b/src/core/algorithm/hnsw_sparse/hnsw_sparse_streamer.cc @@ -27,7 +27,7 @@ namespace core { HnswSparseStreamer::HnswSparseStreamer() : entity_(stats_) {} HnswSparseStreamer::~HnswSparseStreamer() { - if (state_ == STATE_INITED) { + if (state_ == STATE_INITED || state_ == STATE_OPENED) { this->cleanup(); } } diff --git a/src/core/algorithm/vamana/vamana_streamer.cc b/src/core/algorithm/vamana/vamana_streamer.cc index ae935eb81..2738a98ad 100644 --- a/src/core/algorithm/vamana/vamana_streamer.cc +++ b/src/core/algorithm/vamana/vamana_streamer.cc @@ -26,7 +26,7 @@ namespace core { VamanaStreamer::VamanaStreamer() = default; VamanaStreamer::~VamanaStreamer() { - if (state_ == STATE_INITED) { + if (state_ == STATE_INITED || state_ == STATE_OPENED) { this->cleanup(); } } diff --git a/src/db/index/segment/segment.cc b/src/db/index/segment/segment.cc index 7d3b2a56b..210d5a0d5 100644 --- a/src/db/index/segment/segment.cc +++ b/src/db/index/segment/segment.cc @@ -522,10 +522,20 @@ Status SegmentImpl::close() { } } vector_indexers_.clear(); + for (const auto &[name, indexers] : quant_vector_indexers_) { + for (auto indexer : indexers) { + indexer->Close(); + } + } + quant_vector_indexers_.clear(); for (auto [name, indexer] : memory_vector_indexers_) { indexer->Close(); } memory_vector_indexers_.clear(); + for (auto [name, indexer] : quant_memory_vector_indexers_) { + indexer->Close(); + } + quant_memory_vector_indexers_.clear(); return Status::OK(); } From 67742c0f6bf3b06b0c40ca49cc9b6d35068646fe Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Mon, 25 May 2026 14:08:24 +0800 Subject: [PATCH 21/47] fix --- src/core/algorithm/hnsw/hnsw_streamer_entity.h | 13 +++++++++++++ .../hnsw_rabitq/hnsw_rabitq_streamer_entity.h | 13 +++++++++++++ 2 files changed, 26 insertions(+) diff --git a/src/core/algorithm/hnsw/hnsw_streamer_entity.h b/src/core/algorithm/hnsw/hnsw_streamer_entity.h index 59f0285a9..9c7dfa97c 100644 --- a/src/core/algorithm/hnsw/hnsw_streamer_entity.h +++ b/src/core/algorithm/hnsw/hnsw_streamer_entity.h @@ -414,6 +414,7 @@ class HnswStreamerEntity : public HnswEntity { meta.level = level; meta.index = (chunk_index << upper_neighbor_mask_bits_) | (chunk_offset / upper_neighbor_size_); + size_t zero_start = chunk_offset; chunk_offset += upper_neighbor_size_ * level; if (ailego_unlikely(!upper_neighbor_index_->insert(id, meta.data))) { LOG_ERROR("HashMap insert value failed"); @@ -425,6 +426,18 @@ class HnswStreamerEntity : public HnswEntity { return IndexError_Runtime; } + // Zero-initialize the new upper neighbor region to ensure + // NeighborsHeader::neighbor_cnt is 0 before update_neighbors() writes it. + // Without this, the entry point node (whose add_node returns early) would + // have uninitialized neighbor data, causing garbage reads during traversal. + char zeros[neighbors_size]; + memset(zeros, 0, neighbors_size); + if (ailego_unlikely(chunk->write(zero_start, zeros, neighbors_size) != + neighbors_size)) { + LOG_ERROR("Chunk write zeros failed"); + return IndexError_Runtime; + } + return 0; } diff --git a/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.h b/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.h index ea36143af..f9ae998c5 100644 --- a/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.h +++ b/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.h @@ -373,6 +373,7 @@ class HnswRabitqStreamerEntity : public HnswRabitqEntity { meta.level = level; meta.index = (chunk_index << upper_neighbor_mask_bits_) | (chunk_offset / upper_neighbor_size_); + size_t zero_start = chunk_offset; chunk_offset += upper_neighbor_size_ * level; if (ailego_unlikely(!upper_neighbor_index_->insert(id, meta.data))) { LOG_ERROR("HashMap insert value failed"); @@ -384,6 +385,18 @@ class HnswRabitqStreamerEntity : public HnswRabitqEntity { return IndexError_Runtime; } + // Zero-initialize the new upper neighbor region to ensure + // NeighborsHeader::neighbor_cnt is 0 before update_neighbors() writes it. + // Without this, the entry point node (whose add_node returns early) would + // have uninitialized neighbor data, causing garbage reads during traversal. + char zeros[neighbors_size]; + memset(zeros, 0, neighbors_size); + if (ailego_unlikely(chunk->write(zero_start, zeros, neighbors_size) != + neighbors_size)) { + LOG_ERROR("Chunk write zeros failed"); + return IndexError_Runtime; + } + return 0; } From fcce41d0c5fec599ea1ecdc8945ddae96118bab6 Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Mon, 25 May 2026 15:34:19 +0800 Subject: [PATCH 22/47] fix --- .../algorithm/hnsw/hnsw_streamer_entity.h | 34 ++++++++++++------- .../hnsw_rabitq/hnsw_rabitq_streamer_entity.h | 34 ++++++++++++------- 2 files changed, 44 insertions(+), 24 deletions(-) diff --git a/src/core/algorithm/hnsw/hnsw_streamer_entity.h b/src/core/algorithm/hnsw/hnsw_streamer_entity.h index 9c7dfa97c..c4636c4d7 100644 --- a/src/core/algorithm/hnsw/hnsw_streamer_entity.h +++ b/src/core/algorithm/hnsw/hnsw_streamer_entity.h @@ -416,28 +416,38 @@ class HnswStreamerEntity : public HnswEntity { (chunk_offset / upper_neighbor_size_); size_t zero_start = chunk_offset; chunk_offset += upper_neighbor_size_ * level; - if (ailego_unlikely(!upper_neighbor_index_->insert(id, meta.data))) { - LOG_ERROR("HashMap insert value failed"); - return IndexError_Runtime; - } + // IMPORTANT: order matters here. + // 1) resize so the chunk's data_size covers the new region. + // 2) zero-fill the new region: storage backends like BufferStorage do + // NOT zero on resize -- only metadata is updated, and the underlying + // page may contain stale content from a previously-evicted page. + // Without this step, NeighborsHeader::neighbor_cnt is garbage and + // select_entry_point()/search_neighbors() iterate over garbage + // node_ids, eventually triggering find()'s assertion in + // get_upper_neighbor_chunk_loc(). + // 3) ONLY THEN publish the entry to upper_neighbor_index_, so that any + // concurrent reader that finds this id already sees a properly + // zeroed upper-neighbor slot. if (ailego_unlikely(chunk->resize(chunk_offset) != chunk_offset)) { LOG_ERROR("Chunk resize to %zu failed", (size_t)chunk_offset); return IndexError_Runtime; } - // Zero-initialize the new upper neighbor region to ensure - // NeighborsHeader::neighbor_cnt is 0 before update_neighbors() writes it. - // Without this, the entry point node (whose add_node returns early) would - // have uninitialized neighbor data, causing garbage reads during traversal. - char zeros[neighbors_size]; - memset(zeros, 0, neighbors_size); - if (ailego_unlikely(chunk->write(zero_start, zeros, neighbors_size) != - neighbors_size)) { + // Use std::vector instead of a VLA: VLAs are a GNU extension and may + // produce different codegen / be rejected under clang/MSVC. + std::vector zeros(neighbors_size, 0); + if (ailego_unlikely(chunk->write(zero_start, zeros.data(), + neighbors_size) != neighbors_size)) { LOG_ERROR("Chunk write zeros failed"); return IndexError_Runtime; } + if (ailego_unlikely(!upper_neighbor_index_->insert(id, meta.data))) { + LOG_ERROR("HashMap insert value failed"); + return IndexError_Runtime; + } + return 0; } diff --git a/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.h b/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.h index f9ae998c5..02c56ee72 100644 --- a/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.h +++ b/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.h @@ -375,28 +375,38 @@ class HnswRabitqStreamerEntity : public HnswRabitqEntity { (chunk_offset / upper_neighbor_size_); size_t zero_start = chunk_offset; chunk_offset += upper_neighbor_size_ * level; - if (ailego_unlikely(!upper_neighbor_index_->insert(id, meta.data))) { - LOG_ERROR("HashMap insert value failed"); - return IndexError_Runtime; - } + // IMPORTANT: order matters here. + // 1) resize so the chunk's data_size covers the new region. + // 2) zero-fill the new region: storage backends like BufferStorage do + // NOT zero on resize -- only metadata is updated, and the underlying + // page may contain stale content from a previously-evicted page. + // Without this step, NeighborsHeader::neighbor_cnt is garbage and + // select_entry_point()/search_neighbors() iterate over garbage + // node_ids, eventually triggering find()'s assertion in + // get_upper_neighbor_chunk_loc() at line 291. + // 3) ONLY THEN publish the entry to upper_neighbor_index_, so that any + // concurrent reader that finds this id already sees a properly + // zeroed upper-neighbor slot. if (ailego_unlikely(chunk->resize(chunk_offset) != chunk_offset)) { LOG_ERROR("Chunk resize to %zu failed", (size_t)chunk_offset); return IndexError_Runtime; } - // Zero-initialize the new upper neighbor region to ensure - // NeighborsHeader::neighbor_cnt is 0 before update_neighbors() writes it. - // Without this, the entry point node (whose add_node returns early) would - // have uninitialized neighbor data, causing garbage reads during traversal. - char zeros[neighbors_size]; - memset(zeros, 0, neighbors_size); - if (ailego_unlikely(chunk->write(zero_start, zeros, neighbors_size) != - neighbors_size)) { + // Use std::vector instead of a VLA: VLAs are a GNU extension and may + // produce different codegen / be rejected under clang/MSVC. + std::vector zeros(neighbors_size, 0); + if (ailego_unlikely(chunk->write(zero_start, zeros.data(), + neighbors_size) != neighbors_size)) { LOG_ERROR("Chunk write zeros failed"); return IndexError_Runtime; } + if (ailego_unlikely(!upper_neighbor_index_->insert(id, meta.data))) { + LOG_ERROR("HashMap insert value failed"); + return IndexError_Runtime; + } + return 0; } From 4b8f2e6499860567717e1e6bebb68374cae617bc Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Mon, 25 May 2026 16:52:20 +0800 Subject: [PATCH 23/47] fix --- .../algorithm/hnsw/hnsw_streamer_entity.h | 21 +++++++++++-------- .../hnsw_rabitq/hnsw_rabitq_streamer_entity.h | 11 ++++++++++ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/src/core/algorithm/hnsw/hnsw_streamer_entity.h b/src/core/algorithm/hnsw/hnsw_streamer_entity.h index c4636c4d7..483aacdb3 100644 --- a/src/core/algorithm/hnsw/hnsw_streamer_entity.h +++ b/src/core/algorithm/hnsw/hnsw_streamer_entity.h @@ -17,6 +17,7 @@ #include #include #include +#include #if defined(__linux__) || defined(__APPLE__) #include #endif @@ -323,6 +324,10 @@ class HnswStreamerEntity : public HnswEntity { inline std::pair get_upper_neighbor_chunk_loc( level_t level, node_id_t id) const { + // Shared lock: concurrent readers are fine, but must synchronize with + // add_upper_neighbor's exclusive lock to avoid data-race on + // slots_.size() inside HnswIndexHashMap. + std::shared_lock lk(upper_neighbor_rw_mutex_); auto it = upper_neighbor_index_->find(id); ailego_assert_abort(it != upper_neighbor_index_->end(), "Get upper neighbor header failed"); @@ -370,12 +375,10 @@ class HnswStreamerEntity : public HnswEntity { if (level == 0) { return 0; } - // Serialize concurrent add_upper_neighbor calls: multiple build threads - // share the same entity via shared_mutex (shared-lock), so both - // upper_neighbor_chunks_ (vector mutation) and - // upper_neighbor_index_->insert (hashmap slot assignment) must be protected - // from concurrent writes. - std::lock_guard lk(upper_neighbor_mutex_); + // Exclusive lock: protects upper_neighbor_chunks_.emplace_back() and + // upper_neighbor_index_->insert() from racing with concurrent find() + // calls in get_upper_neighbor_chunk_loc(). + std::unique_lock lk(upper_neighbor_rw_mutex_); Chunk::Pointer chunk; uint64_t chunk_offset = UINT64_MAX; size_t neighbors_size = get_total_upper_neighbors_size(level); @@ -558,9 +561,9 @@ class HnswStreamerEntity : public HnswEntity { protected: IndexStreamer::Stats &stats_; std::mutex mutex_{}; - //! Guards add_upper_neighbor (upper_neighbor_chunks_ + upper_neighbor_index_ - //! insert) against concurrent build threads holding the shared lock. - mutable std::mutex upper_neighbor_mutex_{}; + //! Guards upper_neighbor_index_ and upper_neighbor_chunks_ against + //! concurrent reads (find) and writes (insert/emplace_back). + mutable std::shared_mutex upper_neighbor_rw_mutex_{}; size_t max_index_size_{0UL}; uint32_t chunk_size_{kDefaultChunkSize}; uint32_t upper_neighbor_chunk_size_{kDefaultChunkSize}; diff --git a/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.h b/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.h index 02c56ee72..3e2507462 100644 --- a/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.h +++ b/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include #include @@ -286,6 +287,11 @@ class HnswRabitqStreamerEntity : public HnswRabitqEntity { inline std::pair get_upper_neighbor_chunk_loc( level_t level, node_id_t id) const { + // Shared lock: concurrent readers are fine, but must synchronize with + // add_upper_neighbor's exclusive lock to avoid data-race on + // slots_.size() inside HnswIndexHashMap (the emplace_back in alloc_slot + // is not atomic and concurrent find() may see a stale size value). + std::shared_lock lk(upper_neighbor_rw_mutex_); auto it = upper_neighbor_index_->find(id); ailego_assert_abort(it != upper_neighbor_index_->end(), "Get upper neighbor header failed"); @@ -334,6 +340,10 @@ class HnswRabitqStreamerEntity : public HnswRabitqEntity { if (level == 0) { return 0; } + // Exclusive lock: protects upper_neighbor_chunks_.emplace_back() and + // upper_neighbor_index_->insert() from racing with concurrent find() + // calls in get_upper_neighbor_chunk_loc(). + std::unique_lock lk(upper_neighbor_rw_mutex_); Chunk::Pointer chunk; uint64_t chunk_offset = -1UL; size_t neighbors_size = get_total_upper_neighbors_size(level); @@ -526,6 +536,7 @@ class HnswRabitqStreamerEntity : public HnswRabitqEntity { bool get_vector_enabled_{false}; bool use_key_info_map_{true}; + mutable std::shared_mutex upper_neighbor_rw_mutex_{}; NIHashMapPointer upper_neighbor_index_{}; mutable std::shared_ptr keys_map_lock_{}; From 70323d398116ecf875cf9ba5b77970c9660409fd Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Mon, 25 May 2026 21:39:44 +0800 Subject: [PATCH 24/47] fix --- .../algorithm/hnsw/hnsw_streamer_entity.cc | 20 +++++---- .../algorithm/hnsw/hnsw_streamer_entity.h | 9 ++-- .../hnsw_rabitq/hnsw_rabitq_index_hash.h | 43 +++++++++++-------- .../hnsw_rabitq_streamer_entity.cc | 6 ++- .../hnsw_rabitq/hnsw_rabitq_streamer_entity.h | 33 ++++++++------ 5 files changed, 64 insertions(+), 47 deletions(-) diff --git a/src/core/algorithm/hnsw/hnsw_streamer_entity.cc b/src/core/algorithm/hnsw/hnsw_streamer_entity.cc index acc9bee36..a8ada19e6 100644 --- a/src/core/algorithm/hnsw/hnsw_streamer_entity.cc +++ b/src/core/algorithm/hnsw/hnsw_streamer_entity.cc @@ -37,6 +37,7 @@ int HnswStreamerEntity::init(size_t max_doc_cnt) { std::lock_guard lock(mutex_); broker_ = std::make_shared(stats_); upper_neighbor_index_ = std::make_shared(); + upper_neighbor_rw_mutex_ = std::make_shared(); keys_map_lock_ = std::make_shared(); keys_map_ = std::make_shared>(); if (!keys_map_ || !upper_neighbor_index_ || !broker_ || !keys_map_lock_) { @@ -767,9 +768,10 @@ const HnswEntity::Pointer HnswStreamerEntity::clone() const { HnswStreamerEntity *entity = new (std::nothrow) HnswStreamerEntity( stats_, header(), chunk_size_, node_index_mask_bits_, upper_neighbor_mask_bits_, filter_same_key_, get_vector_enabled_, - upper_neighbor_index_, keys_map_lock_, keys_map_, use_key_info_map_, - std::move(node_chunks), std::move(upper_neighbor_chunks), broker_, - node_chunk_bases_, upper_neighbor_chunk_bases_); + upper_neighbor_index_, upper_neighbor_rw_mutex_, keys_map_lock_, + keys_map_, use_key_info_map_, std::move(node_chunks), + std::move(upper_neighbor_chunks), broker_, node_chunk_bases_, + upper_neighbor_chunk_bases_); if (ailego_unlikely(!entity)) { LOG_ERROR("HnswStreamerEntity new failed"); } @@ -800,9 +802,9 @@ const HnswEntity::Pointer HnswMmapStreamerEntity::clone() const { auto *entity = new (std::nothrow) HnswMmapStreamerEntity( stats_, header(), chunk_size_, node_index_mask_bits_, upper_neighbor_mask_bits_, filter_same_key_, get_vector_enabled_, - upper_neighbor_index_, keys_map_lock_, keys_map_, use_key_info_map_, - std::move(node_chunks), std::move(upper_neighbor_chunks), broker_, - nullptr, nullptr); + upper_neighbor_index_, upper_neighbor_rw_mutex_, keys_map_lock_, + keys_map_, use_key_info_map_, std::move(node_chunks), + std::move(upper_neighbor_chunks), broker_, nullptr, nullptr); if (ailego_unlikely(!entity)) { LOG_ERROR("HnswMmapStreamerEntity new failed"); } @@ -833,9 +835,9 @@ const HnswEntity::Pointer HnswContiguousStreamerEntity::clone() const { auto *entity = new (std::nothrow) HnswContiguousStreamerEntity( stats_, header(), chunk_size_, node_index_mask_bits_, upper_neighbor_mask_bits_, filter_same_key_, get_vector_enabled_, - upper_neighbor_index_, keys_map_lock_, keys_map_, use_key_info_map_, - std::move(node_chunks), std::move(upper_neighbor_chunks), broker_, - nullptr, nullptr); + upper_neighbor_index_, upper_neighbor_rw_mutex_, keys_map_lock_, + keys_map_, use_key_info_map_, std::move(node_chunks), + std::move(upper_neighbor_chunks), broker_, nullptr, nullptr); if (ailego_unlikely(!entity)) { LOG_ERROR("HnswContiguousStreamerEntity new failed"); return HnswEntity::Pointer(); diff --git a/src/core/algorithm/hnsw/hnsw_streamer_entity.h b/src/core/algorithm/hnsw/hnsw_streamer_entity.h index 483aacdb3..6a4714c5d 100644 --- a/src/core/algorithm/hnsw/hnsw_streamer_entity.h +++ b/src/core/algorithm/hnsw/hnsw_streamer_entity.h @@ -252,6 +252,7 @@ class HnswStreamerEntity : public HnswEntity { uint32_t upper_neighbor_mask_bits, bool filter_same_key, bool get_vector_enabled, const NIHashMapPointer &upper_neighbor_index, + const std::shared_ptr &upper_neighbor_rw_mutex, std::shared_ptr &keys_map_lock, const HashMapPointer &keys_map, bool use_key_info_map, @@ -270,6 +271,7 @@ class HnswStreamerEntity : public HnswEntity { filter_same_key_(filter_same_key), get_vector_enabled_(get_vector_enabled), use_key_info_map_(use_key_info_map), + upper_neighbor_rw_mutex_(upper_neighbor_rw_mutex), upper_neighbor_index_(upper_neighbor_index), keys_map_lock_(keys_map_lock), keys_map_(keys_map), @@ -327,7 +329,7 @@ class HnswStreamerEntity : public HnswEntity { // Shared lock: concurrent readers are fine, but must synchronize with // add_upper_neighbor's exclusive lock to avoid data-race on // slots_.size() inside HnswIndexHashMap. - std::shared_lock lk(upper_neighbor_rw_mutex_); + std::shared_lock lk(*upper_neighbor_rw_mutex_); auto it = upper_neighbor_index_->find(id); ailego_assert_abort(it != upper_neighbor_index_->end(), "Get upper neighbor header failed"); @@ -378,7 +380,7 @@ class HnswStreamerEntity : public HnswEntity { // Exclusive lock: protects upper_neighbor_chunks_.emplace_back() and // upper_neighbor_index_->insert() from racing with concurrent find() // calls in get_upper_neighbor_chunk_loc(). - std::unique_lock lk(upper_neighbor_rw_mutex_); + std::unique_lock lk(*upper_neighbor_rw_mutex_); Chunk::Pointer chunk; uint64_t chunk_offset = UINT64_MAX; size_t neighbors_size = get_total_upper_neighbors_size(level); @@ -563,7 +565,8 @@ class HnswStreamerEntity : public HnswEntity { std::mutex mutex_{}; //! Guards upper_neighbor_index_ and upper_neighbor_chunks_ against //! concurrent reads (find) and writes (insert/emplace_back). - mutable std::shared_mutex upper_neighbor_rw_mutex_{}; + //! Shared via shared_ptr so all clones synchronize on the SAME mutex. + mutable std::shared_ptr upper_neighbor_rw_mutex_{}; size_t max_index_size_{0UL}; uint32_t chunk_size_{kDefaultChunkSize}; uint32_t upper_neighbor_chunk_size_{kDefaultChunkSize}; diff --git a/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_index_hash.h b/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_index_hash.h index 4f01aabb3..bf3dc1e7c 100644 --- a/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_index_hash.h +++ b/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_index_hash.h @@ -41,9 +41,9 @@ class HnswIndexHashMap { items_(reinterpret_cast(data)) {} //! Return a empty loc or the key item loc - Slot(Chunk::Pointer &&chunk, IndexStorage::MemoryBlock &&mem_block) - : chunk_(std::move(chunk)), items_block_(std::move(mem_block)) { - items_ = reinterpret_cast(items_block_.data()); + Slot(Chunk::Pointer &&chunk, std::vector &&local_data) + : chunk_(std::move(chunk)), local_data_(std::move(local_data)) { + items_ = reinterpret_cast(local_data_.data()); } const_iterator find(key_type key, uint32_t max_items, uint32_t mask) const { auto it = &items_[key & mask]; @@ -73,8 +73,8 @@ class HnswIndexHashMap { private: Chunk::Pointer chunk_{}; - const Item *items_{nullptr}; // point to chunk data - IndexStorage::MemoryBlock items_block_{}; + const Item *items_{nullptr}; // point to local_data_ + std::vector local_data_{}; }; public: @@ -179,14 +179,18 @@ class HnswIndexHashMap { LOG_ERROR("Chunk resize failed, size=%zu", size); return false; } - //! Read the whole data to memory - IndexStorage::MemoryBlock data_block; - if (ailego_unlikely(chunk->read(0U, data_block, size) != size)) { - LOG_ERROR("Chunk read failed, size=%zu", size); - return false; - } - - slots_.emplace_back(std::move(chunk), std::move(data_block)); + //! Use a local zero-initialized buffer; new chunks contain all zeros, + //! so no buffer-pool read is needed and no ref_count is pinned. + //! NOTE: Previously this used `chunk->read(0U, data_block, size)` which + //! returns a view into the underlying BufferPool page. That made the + //! Slot's `items_` pointer alias buffer-pool memory shared across + //! threads, which under clang -O3 release exposed a data race on + //! Slot::find()'s probing read of `it->second` (concurrent + //! const_cast writes from insert() were not reliably visible). Using a + //! private zero-initialized vector matches the HNSW (non-RABITQ) + //! implementation and avoids this race. + std::vector local_buf(size, 0); + slots_.emplace_back(std::move(chunk), std::move(local_buf)); return true; } @@ -208,13 +212,14 @@ class HnswIndexHashMap { i, chunk->data_size(), size); return IndexError_InvalidFormat; } - //! Read the whole data to memory - IndexStorage::MemoryBlock data_block; - if (ailego_unlikely(chunk->read(0U, data_block, size) != size)) { - LOG_ERROR("Chunk read failed, size=%zu", size); - return false; + //! Copy chunk data into a local buffer via fetch() so that no + //! buffer-pool block is pinned for the lifetime of the Slot. + std::vector local_buf(size); + if (ailego_unlikely(chunk->fetch(0U, local_buf.data(), size) != size)) { + LOG_ERROR("Chunk fetch failed, size=%zu", size); + return IndexError_InvalidFormat; } - slots_.emplace_back(std::move(chunk), std::move(data_block)); + slots_.emplace_back(std::move(chunk), std::move(local_buf)); } return 0; } diff --git a/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.cc b/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.cc index 35501ed94..cef59c35c 100644 --- a/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.cc +++ b/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.cc @@ -34,6 +34,7 @@ int HnswRabitqStreamerEntity::init(size_t max_doc_cnt) { std::lock_guard lock(mutex_); broker_ = std::make_shared(stats_); upper_neighbor_index_ = std::make_shared(); + upper_neighbor_rw_mutex_ = std::make_shared(); keys_map_lock_ = std::make_shared(); keys_map_ = std::make_shared>(); if (!keys_map_ || !upper_neighbor_index_ || !broker_ || !keys_map_lock_) { @@ -697,8 +698,9 @@ const HnswRabitqEntity::Pointer HnswRabitqStreamerEntity::clone() const { new (std::nothrow) HnswRabitqStreamerEntity( stats_, header(), chunk_size_, node_index_mask_bits_, upper_neighbor_mask_bits_, filter_same_key_, get_vector_enabled_, - upper_neighbor_index_, keys_map_lock_, keys_map_, use_key_info_map_, - std::move(node_chunks), std::move(upper_neighbor_chunks), broker_); + upper_neighbor_index_, upper_neighbor_rw_mutex_, keys_map_lock_, + keys_map_, use_key_info_map_, std::move(node_chunks), + std::move(upper_neighbor_chunks), broker_); if (ailego_unlikely(!entity)) { LOG_ERROR("HnswRabitqStreamerEntity new failed"); } diff --git a/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.h b/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.h index 3e2507462..7c5b600e7 100644 --- a/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.h +++ b/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.h @@ -217,17 +217,17 @@ class HnswRabitqStreamerEntity : public HnswRabitqEntity { using NIHashMapPointer = std::shared_ptr; //! Private construct, only be called by clone method - HnswRabitqStreamerEntity(IndexStreamer::Stats &stats, const HNSWHeader &hd, - size_t chunk_size, uint32_t node_index_mask_bits, - uint32_t upper_neighbor_mask_bits, - bool filter_same_key, bool get_vector_enabled, - const NIHashMapPointer &upper_neighbor_index, - std::shared_ptr &keys_map_lock, - const HashMapPointer &keys_map, - bool use_key_info_map, - std::vector &&node_chunks, - std::vector &&upper_neighbor_chunks, - const HnswRabitqChunkBroker::Pointer &broker) + HnswRabitqStreamerEntity( + IndexStreamer::Stats &stats, const HNSWHeader &hd, size_t chunk_size, + uint32_t node_index_mask_bits, uint32_t upper_neighbor_mask_bits, + bool filter_same_key, bool get_vector_enabled, + const NIHashMapPointer &upper_neighbor_index, + const std::shared_ptr &upper_neighbor_rw_mutex, + std::shared_ptr &keys_map_lock, + const HashMapPointer &keys_map, bool use_key_info_map, + std::vector &&node_chunks, + std::vector &&upper_neighbor_chunks, + const HnswRabitqChunkBroker::Pointer &broker) : stats_(stats), chunk_size_(chunk_size), node_index_mask_bits_(node_index_mask_bits), @@ -238,6 +238,7 @@ class HnswRabitqStreamerEntity : public HnswRabitqEntity { filter_same_key_(filter_same_key), get_vector_enabled_(get_vector_enabled), use_key_info_map_(use_key_info_map), + upper_neighbor_rw_mutex_(upper_neighbor_rw_mutex), upper_neighbor_index_(upper_neighbor_index), keys_map_lock_(keys_map_lock), keys_map_(keys_map), @@ -291,7 +292,7 @@ class HnswRabitqStreamerEntity : public HnswRabitqEntity { // add_upper_neighbor's exclusive lock to avoid data-race on // slots_.size() inside HnswIndexHashMap (the emplace_back in alloc_slot // is not atomic and concurrent find() may see a stale size value). - std::shared_lock lk(upper_neighbor_rw_mutex_); + std::shared_lock lk(*upper_neighbor_rw_mutex_); auto it = upper_neighbor_index_->find(id); ailego_assert_abort(it != upper_neighbor_index_->end(), "Get upper neighbor header failed"); @@ -343,7 +344,7 @@ class HnswRabitqStreamerEntity : public HnswRabitqEntity { // Exclusive lock: protects upper_neighbor_chunks_.emplace_back() and // upper_neighbor_index_->insert() from racing with concurrent find() // calls in get_upper_neighbor_chunk_loc(). - std::unique_lock lk(upper_neighbor_rw_mutex_); + std::unique_lock lk(*upper_neighbor_rw_mutex_); Chunk::Pointer chunk; uint64_t chunk_offset = -1UL; size_t neighbors_size = get_total_upper_neighbors_size(level); @@ -536,7 +537,11 @@ class HnswRabitqStreamerEntity : public HnswRabitqEntity { bool get_vector_enabled_{false}; bool use_key_info_map_{true}; - mutable std::shared_mutex upper_neighbor_rw_mutex_{}; + // Shared via shared_ptr so that all cloned entities synchronize against + // the SAME mutex instance. A plain std::shared_mutex member would be + // independent per clone and provide no real protection for the shared + // upper_neighbor_index_ hashmap. + mutable std::shared_ptr upper_neighbor_rw_mutex_{}; NIHashMapPointer upper_neighbor_index_{}; mutable std::shared_ptr keys_map_lock_{}; From 31266ef93ca6b5be4ed24dd3fd31f98eaba99260 Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Mon, 25 May 2026 21:43:05 +0800 Subject: [PATCH 25/47] clang format --- .../algorithm/hnsw/hnsw_streamer_entity.h | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/src/core/algorithm/hnsw/hnsw_streamer_entity.h b/src/core/algorithm/hnsw/hnsw_streamer_entity.h index 6a4714c5d..677393de3 100644 --- a/src/core/algorithm/hnsw/hnsw_streamer_entity.h +++ b/src/core/algorithm/hnsw/hnsw_streamer_entity.h @@ -247,20 +247,19 @@ class HnswStreamerEntity : public HnswEntity { using NIHashMapPointer = std::shared_ptr; //! Clone construct, used by clone method in subclasses - HnswStreamerEntity(IndexStreamer::Stats &stats, const HNSWHeader &hd, - size_t chunk_size, uint32_t node_index_mask_bits, - uint32_t upper_neighbor_mask_bits, bool filter_same_key, - bool get_vector_enabled, - const NIHashMapPointer &upper_neighbor_index, - const std::shared_ptr &upper_neighbor_rw_mutex, - std::shared_ptr &keys_map_lock, - const HashMapPointer &keys_map, - bool use_key_info_map, - std::vector &&node_chunks, - std::vector &&upper_neighbor_chunks, - const ChunkBroker::Pointer &broker, - std::shared_ptr> node_bases, - std::shared_ptr> upper_bases) + HnswStreamerEntity( + IndexStreamer::Stats &stats, const HNSWHeader &hd, size_t chunk_size, + uint32_t node_index_mask_bits, uint32_t upper_neighbor_mask_bits, + bool filter_same_key, bool get_vector_enabled, + const NIHashMapPointer &upper_neighbor_index, + const std::shared_ptr &upper_neighbor_rw_mutex, + std::shared_ptr &keys_map_lock, + const HashMapPointer &keys_map, bool use_key_info_map, + std::vector &&node_chunks, + std::vector &&upper_neighbor_chunks, + const ChunkBroker::Pointer &broker, + std::shared_ptr> node_bases, + std::shared_ptr> upper_bases) : stats_(stats), chunk_size_(chunk_size), node_index_mask_bits_(node_index_mask_bits), From 881a0b08a11b1445e6f5686168f795e6269af473 Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Tue, 26 May 2026 12:53:29 +0800 Subject: [PATCH 26/47] fix compile --- src/include/zvec/core/framework/index_storage.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/include/zvec/core/framework/index_storage.h b/src/include/zvec/core/framework/index_storage.h index 416d59139..3da2e6669 100644 --- a/src/include/zvec/core/framework/index_storage.h +++ b/src/include/zvec/core/framework/index_storage.h @@ -65,7 +65,6 @@ class IndexStorage : public IndexModule { MemoryBlock(const MemoryBlock &rhs) { switch (rhs.type_) { case MemoryBlockType::MBT_MMAP: - case MemoryBlockType::MBT_HEAP_SCRATCH: this->reset(rhs.data_); break; case MemoryBlockType::MBT_BUFFERPOOL: From af4413b42d28a3b7c7cc52ec194f70e52c092435 Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Tue, 26 May 2026 15:21:01 +0800 Subject: [PATCH 27/47] fix --- src/ailego/buffer/vector_page_table.cc | 112 +++++++++++-- src/core/utility/buffer_storage.cc | 158 +++++++++++++----- .../zvec/ailego/buffer/vector_page_table.h | 45 +++-- 3 files changed, 246 insertions(+), 69 deletions(-) diff --git a/src/ailego/buffer/vector_page_table.cc b/src/ailego/buffer/vector_page_table.cc index 78dcd3c69..34955a2b4 100644 --- a/src/ailego/buffer/vector_page_table.cc +++ b/src/ailego/buffer/vector_page_table.cc @@ -60,15 +60,25 @@ namespace ailego { const size_t kVectorPageSize = MemoryHelper::PageSize(); -void VectorPageTable::init(size_t entry_num) { - // Free old segments if any. - for (size_t i = 0; i < segment_count_; ++i) { +bool VectorPageTable::init(size_t entry_num) { + size_t need_segments = (entry_num + kSegmentSize - 1) / kSegmentSize; + if (need_segments > kMaxSegments) { + LOG_ERROR( + "VectorPageTable::init: entry_num=%zu exceeds capacity " + "(kMaxEntries=%zu, need_segments=%zu, kMaxSegments=%zu); " + "refusing to init.", + entry_num, kMaxEntries, need_segments, kMaxSegments); + return false; + } + // Free old segments if any. init() is only called from VecBufferPool::init + // which is single-threaded with respect to other accesses, so a relaxed + // load of segment_count_ is sufficient here. + size_t old_count = segment_count_.load(std::memory_order_relaxed); + for (size_t i = 0; i < old_count; ++i) { delete[] segments_[i]; segments_[i] = nullptr; } - entry_num_ = entry_num; - segment_count_ = (entry_num + kSegmentSize - 1) / kSegmentSize; - for (size_t s = 0; s < segment_count_; ++s) { + for (size_t s = 0; s < need_segments; ++s) { segments_[s] = new Entry[kSegmentSize]; for (size_t i = 0; i < kSegmentSize; ++i) { segments_[s][i].ref_count.store(std::numeric_limits::min()); @@ -78,12 +88,33 @@ void VectorPageTable::init(size_t entry_num) { segments_[s][i].file_offset = 0; } } + // Publish new segments to readers. segment_count_ is published first + // (release) so that a reader that acquire-loads segment_count_ before + // entry_num_ also sees a consistent segment table; entry_num_ is the + // primary synchronization point used by callers via entry_num(). + segment_count_.store(need_segments, std::memory_order_release); + entry_num_.store(entry_num, std::memory_order_release); + return true; } -void VectorPageTable::extend(size_t new_entry_num) { - if (new_entry_num <= entry_num_) return; +bool VectorPageTable::extend(size_t new_entry_num) { + // Relaxed read is fine: extend() is serialized by the caller (extend_file + // is invoked under the BufferStorage write latch). No other writer races + // with us on entry_num_ / segment_count_. + if (new_entry_num <= entry_num_.load(std::memory_order_relaxed)) { + return true; + } size_t new_segment_count = (new_entry_num + kSegmentSize - 1) / kSegmentSize; - for (size_t s = segment_count_; s < new_segment_count; ++s) { + if (new_segment_count > kMaxSegments) { + LOG_ERROR( + "VectorPageTable::extend: new_entry_num=%zu exceeds capacity " + "(kMaxEntries=%zu, new_segment_count=%zu, kMaxSegments=%zu); " + "refusing to extend.", + new_entry_num, kMaxEntries, new_segment_count, kMaxSegments); + return false; + } + size_t old_count = segment_count_.load(std::memory_order_relaxed); + for (size_t s = old_count; s < new_segment_count; ++s) { segments_[s] = new Entry[kSegmentSize]; for (size_t i = 0; i < kSegmentSize; ++i) { segments_[s][i].ref_count.store(std::numeric_limits::min()); @@ -93,12 +124,17 @@ void VectorPageTable::extend(size_t new_entry_num) { segments_[s][i].file_offset = 0; } } - segment_count_ = new_segment_count; - entry_num_ = new_entry_num; + // Publish in the same order as init(): segment_count_ first, entry_num_ + // last. Both are release-stores so that the prior segment allocation / + // Entry initialization is visible to any reader that acquire-loads either + // counter (typically via entry_num()). + segment_count_.store(new_segment_count, std::memory_order_release); + entry_num_.store(new_entry_num, std::memory_order_release); + return true; } char *VectorPageTable::acquire_block(block_id_t block_id) { - assert(block_id < entry_num_); + assert(block_id < entry_num_.load(std::memory_order_relaxed)); Entry &e = entry_at(block_id); while (true) { int current_count = e.ref_count.load(std::memory_order_acquire); @@ -114,7 +150,7 @@ char *VectorPageTable::acquire_block(block_id_t block_id) { } void VectorPageTable::release_block(block_id_t block_id) { - assert(block_id < entry_num_); + assert(block_id < entry_num_.load(std::memory_order_relaxed)); Entry &e = entry_at(block_id); if (e.ref_count.fetch_sub(1, std::memory_order_release) == 1) { @@ -133,7 +169,7 @@ void VectorPageTable::release_block(block_id_t block_id) { } void VectorPageTable::evict_block(block_id_t block_id) { - assert(block_id < entry_num_); + assert(block_id < entry_num_.load(std::memory_order_relaxed)); Entry &e = entry_at(block_id); int expected = 0; // Two-phase eviction to prevent data race on e.buffer with @@ -163,7 +199,7 @@ void VectorPageTable::evict_block(block_id_t block_id) { char *VectorPageTable::set_block_acquired(block_id_t block_id, char *buffer, size_t file_offset) { - assert(block_id < entry_num_); + assert(block_id < entry_num_.load(std::memory_order_relaxed)); Entry &e = entry_at(block_id); while (true) { int current_count = e.ref_count.load(std::memory_order_acquire); @@ -224,7 +260,14 @@ VecBufferPool::VecBufferPool(const std::string &filename, bool writable, int VecBufferPool::init() { size_t block_num = (file_size_ + kVectorPageSize - 1) / kVectorPageSize; - page_table_.init(block_num); + if (!page_table_.init(block_num)) { + LOG_ERROR( + "VecBufferPool::init: page_table_ init failed for file[%s], " + "file_size=%zu, block_num=%zu (exceeds VectorPageTable::kMaxEntries=%zu)", + file_name_.c_str(), file_size_, block_num, + VectorPageTable::kMaxEntries); + return -1; + } block_mutexes_ = std::make_unique(VecBufferPool::kMutexBucketCount); LOG_DEBUG("entry num: %zu, file_size: %zu", page_table_.entry_num(), @@ -393,14 +436,27 @@ int VecBufferPool::flush_all() { return 0; } int rc = 0; + size_t total_dirty = 0; + size_t fail_count = 0; for (size_t i = 0; i < page_table_.entry_num(); ++i) { if (page_table_.is_block_dirty(i)) { + ++total_dirty; int r = page_table_.flush_block(i); if (r != 0) { rc = r; + ++fail_count; } } } + if (fail_count != 0) { + // Aggregated diagnostic so that callers (notably ~VecBufferPool, which + // discards the return value) cannot silently lose dirty pages: any + // unflushed page at this point means the on-disk image is now stale. + LOG_ERROR( + "VecBufferPool::flush_all: %zu/%zu dirty page(s) failed to flush, " + "file[%s] last_rc=%d -- on-disk data may be stale.", + fail_count, total_dirty, file_name_.c_str(), rc); + } return rc; } @@ -413,6 +469,19 @@ bool VecBufferPool::extend_file(size_t new_size) { if (new_size <= file_size_) { return true; } + // Pre-validate against the page table's static capacity BEFORE mutating + // any on-disk state. Otherwise a successful ftruncate followed by a + // failed page_table_.extend() would leave the file size and the page + // table out of sync (file grew, but no Entry slots cover the new range). + size_t new_entry_num = (new_size + kVectorPageSize - 1) / kVectorPageSize; + if (new_entry_num > VectorPageTable::kMaxEntries) { + LOG_ERROR( + "extend_file: requested new_size=%zu would require %zu page entries, " + "exceeding VectorPageTable::kMaxEntries=%zu (file=%s).", + new_size, new_entry_num, VectorPageTable::kMaxEntries, + file_name_.c_str()); + return false; + } #if defined(_MSC_VER) if (_chsize_s(fd_, static_cast(new_size)) != 0) { LOG_ERROR("extend_file _chsize_s failed: file[%s], new_size[%zu]", @@ -429,9 +498,16 @@ bool VecBufferPool::extend_file(size_t new_size) { file_size_ = new_size; // Extend the page table to cover the new file range. Existing entries // stay at their original addresses so concurrent readers are unaffected. - size_t new_entry_num = (file_size_ + kVectorPageSize - 1) / kVectorPageSize; + // Capacity has already been validated above, so this should never fail; + // a failure here would indicate a programming error and is logged. if (new_entry_num > page_table_.entry_num()) { - page_table_.extend(new_entry_num); + if (!page_table_.extend(new_entry_num)) { + LOG_ERROR( + "extend_file: page_table_.extend(%zu) failed unexpectedly after " + "capacity pre-check (file=%s, new_size=%zu).", + new_entry_num, file_name_.c_str(), new_size); + return false; + } } return true; } diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index 56e1755d5..87926ab53 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -31,30 +31,16 @@ namespace zvec { namespace core { -// Thread-local reusable scratch buffer for cross-page reads in the -// read(const void**) overload. Avoids allocating a new buffer on -// every cross-page read by reusing the same allocation on each thread. The -// returned pointer is valid only until the next cross-page read() on -// the same thread -- matching the single-page path's transient -// lifetime (ref released immediately, page may be evicted any time). -struct CrossPageScratch { - char *buf = nullptr; - size_t cap = 0; - ~CrossPageScratch() { - if (buf) ailego_free(buf); - } - char *ensure(size_t len) { - if (cap < len) { - if (buf) ailego_free(buf); - // C11 aligned_alloc requires size to be a multiple of alignment. - const size_t kAlign = 4096UL; - size_t alloc_size = (len + (kAlign - 1UL)) & ~(kAlign - 1UL); - buf = static_cast(ailego_aligned_malloc(alloc_size, kAlign)); - cap = buf ? alloc_size : 0; - } - return buf; - } -}; +// Cross-page reads through the legacy read(const void**) overload need a +// buffer whose lifetime is at least as long as the BufferStorage itself, +// because callers store the returned pointer indefinitely (the historical +// contract is "pointer is valid until the storage is closed"). Earlier +// revisions used a thread_local scratch buffer here, which subtly broke +// that contract: the next cross-page read(const void**) on the SAME thread +// silently overwrote the buffer, dangling every previously-handed-out +// pointer. We now allocate per call and hand ownership to the storage's +// tmp_buffers_ list (freed in close_index()). Callers that want bounded +// memory should migrate to the read(MemoryBlock&) overload. /*! Buffer Storage */ @@ -166,25 +152,38 @@ class BufferStorage : public IndexStorage { return 0; } *data = raw; - // Release the buffer-pool ref count acquired by get_single_page(). - // The pointer remains valid as long as the page is not evicted; callers - // needing a stable pin should use the read(MemoryBlock&) overload. - owner_->buffer_pool_handle_->release_one(page_id); + // NOTE: get_single_page() acquires a pin on the page; we intentionally + // do NOT release it here. The legacy contract of read(const void**) + // is that the returned pointer remains valid until the storage is + // closed (an implicit, never-released pin). Many call sites rely on + // this lifetime guarantee. Callers that want explicit pin/release + // semantics should migrate to the read(MemoryBlock&) overload, which + // hands the ref-count to a RAII MemoryBlock. + (void)page_id; return len; } - // Reuse a thread-local scratch buffer to avoid allocating on - // every cross-page read. The pointer is valid until the next - // cross-page read(const void**) on the same thread. - thread_local CrossPageScratch scratch; - char *tmp = scratch.ensure(len); + // Cross-page path: allocate a buffer whose ownership is handed to + // owner_->tmp_buffers_ so that the returned pointer remains valid + // for the entire lifetime of the BufferStorage (matching the + // single-page "pinned forever" semantics established above). + // C11 aligned_alloc requires size to be a multiple of alignment. + const size_t kAlign = 4096UL; + size_t alloc_size = (len + (kAlign - 1UL)) & ~(kAlign - 1UL); + char *tmp = + static_cast(ailego_aligned_malloc(alloc_size, kAlign)); if (!tmp) { *data = nullptr; return 0; } if (!owner_->buffer_pool_handle_->read_range(abs_offset, len, tmp)) { + ailego_free(tmp); *data = nullptr; return 0; } + { + std::lock_guard tmp_latch(owner_->tmp_buffers_mutex_); + owner_->tmp_buffers_.push_back(tmp); + } *data = tmp; return len; } @@ -263,6 +262,14 @@ class BufferStorage : public IndexStorage { owner_->file_name_.c_str(), segment_id_); return 0; } + if (ailego_unlikely( + owner_->corrupted_.load(std::memory_order_acquire))) { + LOG_ERROR( + "WrappedSegment::write: storage is marked corrupted, refusing " + "write, file[%s], id[%zu]", + owner_->file_name_.c_str(), segment_id_); + return 0; + } // In read-only mode the write is a silent no-op so that callers that // unconditionally write (e.g. CRC updates) do not return an error. if (!owner_->buffer_pool_->writable()) { @@ -803,6 +810,13 @@ class BufferStorage : public IndexStorage { file_name_.c_str()); return IndexError_Runtime; } + if (corrupted_.load(std::memory_order_acquire)) { + LOG_ERROR( + "BufferStorage::flush_index skipped: storage is marked corrupted, " + "file[%s]", + file_name_.c_str()); + return IndexError_Runtime; + } if (!buffer_pool_->writable()) { // Read-only pool: nothing to flush. index_dirty_.store(false, std::memory_order_relaxed); @@ -831,10 +845,29 @@ class BufferStorage : public IndexStorage { // stored by a concurrent refresh_index() during this flush. const uint64_t consumed_chkp = pending_check_point_.load(std::memory_order_relaxed); + // Restore consumed_chkp into pending_check_point_ on any failure path + // below so that the in-flight value is not lost. Although the current + // implementation only LOADs consumed_chkp (so pending already holds it), + // this explicit monotonic CAS-back makes the invariant + // (pending_check_point_ >= consumed_chkp) self-evident and resilient to + // future refactors that might exchange/zero pending eagerly. Uses the + // same CAS-loop max as refresh_index() so a concurrent larger chkp + // wins. + auto restore_chkp_on_failure = [this, consumed_chkp]() { + if (consumed_chkp == 0) return; + uint64_t cur = pending_check_point_.load(std::memory_order_relaxed); + while (consumed_chkp > cur) { + if (pending_check_point_.compare_exchange_weak( + cur, consumed_chkp, std::memory_order_relaxed)) { + break; + } + } + }; // Flush all dirty data blocks to the backing file first. if (buffer_pool_handle_->flush_all() != 0) { // Restore dirty so the next flush_index() retries. index_dirty_.store(true, std::memory_order_relaxed); + restore_chkp_on_failure(); LOG_ERROR("flush_all data blocks failed: file[%s]", file_name_.c_str()); return IndexError_WriteData; } @@ -857,6 +890,7 @@ class BufferStorage : public IndexStorage { LOG_ERROR("Failed to write segment meta: file[%s], chain[%zu]", file_name_.c_str(), ci); index_dirty_.store(true, std::memory_order_relaxed); + restore_chkp_on_failure(); return IndexError_WriteData; } // Write the updated footer back to disk. @@ -866,6 +900,7 @@ class BufferStorage : public IndexStorage { LOG_ERROR("Failed to write footer: file[%s], chain[%zu]", file_name_.c_str(), ci); index_dirty_.store(true, std::memory_order_relaxed); + restore_chkp_on_failure(); return IndexError_WriteData; } } @@ -922,6 +957,7 @@ class BufferStorage : public IndexStorage { current_header_start_offset_ = 0; pending_check_point_.store(0, std::memory_order_relaxed); index_dirty_.store(false, std::memory_order_relaxed); + corrupted_.store(false, std::memory_order_relaxed); } //! Append a segment into storage. @@ -939,6 +975,13 @@ class BufferStorage : public IndexStorage { LOG_ERROR("append_segment: pool not ready, file[%s]", file_name_.c_str()); return IndexError_Runtime; } + if (corrupted_.load(std::memory_order_acquire)) { + LOG_ERROR( + "append_segment: storage is marked corrupted, refusing to append, " + "file[%s], id[%s]", + file_name_.c_str(), id.c_str()); + return IndexError_Runtime; + } if (!buffer_pool_->writable()) { LOG_ERROR("append_segment: pool is read-only, file[%s]", file_name_.c_str()); @@ -1012,11 +1055,23 @@ class BufferStorage : public IndexStorage { } // Best-effort rollback: restore original old footer on disk if a - // subsequent disk write in this split block fails. - auto undo_old_footer = [&]() { - buffer_pool_handle_->write_meta( - chain->footer_file_offset, sizeof(saved_footer_before_split), - reinterpret_cast(&saved_footer_before_split)); + // subsequent disk write in this split block fails. If THIS rollback + // also fails to land on disk, the file is now in an inconsistent + // state (old footer points forward to a partially-written new chain + // region) -- raise the corrupted_ flag so subsequent writes refuse + // to compound the damage. + auto undo_old_footer = [this, chain, &saved_footer_before_split]() { + if (buffer_pool_handle_->write_meta( + chain->footer_file_offset, sizeof(saved_footer_before_split), + reinterpret_cast(&saved_footer_before_split)) != + 0) { + LOG_ERROR( + "append_segment: rollback write of old footer FAILED, file[%s] " + "is now in an inconsistent state -- marking storage as " + "corrupted; further writes will be rejected.", + file_name_.c_str()); + corrupted_.store(true, std::memory_order_release); + } }; // Extend the file and write the new chain's header + (zero) footer. @@ -1093,9 +1148,22 @@ class BufferStorage : public IndexStorage { saved_old_footer_file_offset, saved_current_header_start]() { // 1. Restore old chain's footer on disk (drop forward link). - buffer_pool_handle_->write_meta( - saved_old_footer_file_offset, sizeof(saved_footer_before_split), - reinterpret_cast(&saved_footer_before_split)); + // A failure here leaves the on-disk old footer still pointing + // at the now-popped new chain region, which ParseToMapping() + // would follow to garbage on the next open. Mark the storage + // corrupted so subsequent writes refuse to proceed. + if (buffer_pool_handle_->write_meta( + saved_old_footer_file_offset, + sizeof(saved_footer_before_split), + reinterpret_cast(&saved_footer_before_split)) != + 0) { + LOG_ERROR( + "append_segment: rollback_step1 write of old footer FAILED, " + "file[%s] is now in an inconsistent state -- marking storage " + "as corrupted; further writes will be rejected.", + file_name_.c_str()); + corrupted_.store(true, std::memory_order_release); + } // 2. Pop the freshly-pushed new chain from in-memory containers. // The associated unique_ptr / unique_ptr // are released here. @@ -1227,6 +1295,14 @@ class BufferStorage : public IndexStorage { private: std::atomic index_dirty_{false}; std::atomic pending_check_point_{0}; + // Set to true when a rollback path inside append_segment() fails to + // restore the on-disk metadata to its pre-call state. Once set, the + // storage is considered corrupted and all subsequent writes + // (write/append_segment/flush_index_locked) refuse to proceed so that + // we do not compound the damage on top of inconsistent on-disk state. + // The flag is only ever raised, never cleared, for the lifetime of the + // BufferStorage instance; close_index() resets the whole object. + std::atomic corrupted_{false}; // Sharded reader-writer lock to eliminate cache-line ping-pong on the // reader counter. Each concurrent reader hashes to its own shard, diff --git a/src/include/zvec/ailego/buffer/vector_page_table.h b/src/include/zvec/ailego/buffer/vector_page_table.h index 24c70838d..337c28c59 100644 --- a/src/include/zvec/ailego/buffer/vector_page_table.h +++ b/src/include/zvec/ailego/buffer/vector_page_table.h @@ -64,7 +64,11 @@ class VectorPageTable { } ~VectorPageTable() { BlockEvictionQueue::get_instance().set_invalid(this); - for (size_t i = 0; i < segment_count_; ++i) { + // Destructor runs without concurrent readers/writers (callers guarantee + // no live handles by the time the page table is destroyed), so a relaxed + // load is sufficient here. + size_t cnt = segment_count_.load(std::memory_order_relaxed); + for (size_t i = 0; i < cnt; ++i) { delete[] segments_[i]; } } @@ -74,12 +78,17 @@ class VectorPageTable { VectorPageTable(VectorPageTable &&) = delete; VectorPageTable &operator=(VectorPageTable &&) = delete; - void init(size_t entry_num); + //! Initialize the page table to cover `entry_num` entries. + //! Returns false (without modifying state) if `entry_num` exceeds the + //! statically allocated segment table capacity (kMaxEntries). + bool init(size_t entry_num); //! Extend the page table to cover at least `new_entry_num` entries. //! Existing entries stay at their original addresses (no invalidation). //! Safe to call while readers operate on existing pages. - void extend(size_t new_entry_num); + //! Returns false (without modifying state) if `new_entry_num` exceeds + //! the statically allocated segment table capacity (kMaxEntries). + bool extend(size_t new_entry_num); char *acquire_block(block_id_t block_id); @@ -96,19 +105,19 @@ class VectorPageTable { //! Mark a loaded block as dirty so that it is persisted on eviction. void mark_dirty(block_id_t block_id) { - assert(block_id < entry_num_); + assert(block_id < entry_num_.load(std::memory_order_relaxed)); entry_at(block_id).is_dirty.store(true, std::memory_order_relaxed); } bool is_block_dirty(block_id_t block_id) const { - assert(block_id < entry_num_); + assert(block_id < entry_num_.load(std::memory_order_relaxed)); return entry_at(block_id).is_dirty.load(std::memory_order_relaxed); } //! Flush a single dirty block without evicting it. Caller guarantees the //! block is currently loaded (buffer != nullptr). int flush_block(block_id_t block_id) { - assert(block_id < entry_num_); + assert(block_id < entry_num_.load(std::memory_order_relaxed)); Entry &e = entry_at(block_id); char *buffer = e.buffer; if (!buffer || !flush_callback_) { @@ -124,12 +133,15 @@ class VectorPageTable { return rc; } + //! Returns the current number of entries. Uses acquire ordering so that + //! callers iterating over [0, entry_num()) are guaranteed to see all + //! segments_[s] writes performed by a concurrent extend()/init(). size_t entry_num() const { - return entry_num_; + return entry_num_.load(std::memory_order_acquire); } bool is_released(block_id_t block_id) const { - assert(block_id < entry_num_); + assert(block_id < entry_num_.load(std::memory_order_relaxed)); return entry_at(block_id).ref_count.load(std::memory_order_relaxed) <= 0; } @@ -144,11 +156,24 @@ class VectorPageTable { static constexpr size_t kSegmentShift = 16; // 65536 entries per segment static constexpr size_t kSegmentSize = size_t{1} << kSegmentShift; static constexpr size_t kSegmentMask = kSegmentSize - 1; + + public: static constexpr size_t kMaxSegments = 2048; // up to 128M entries (512GB @ 4K) + // Maximum number of entries the segment table can ever hold. Callers + // (e.g. VecBufferPool::extend_file) can use this to pre-validate a target + // file size before mutating any on-disk state. + static constexpr size_t kMaxEntries = kMaxSegments * kSegmentSize; - size_t entry_num_{0}; - size_t segment_count_{0}; + private: + // entry_num_ and segment_count_ are mutated by writers in init()/extend() + // and observed by readers in entry_num() and the hot-path methods. They + // are atomic to establish a release/acquire synchronization edge with the + // (non-atomic) writes to segments_[s] performed prior to the store: any + // reader that observes the new entry_num_ is guaranteed to see the + // fully-initialized Entry slots in the corresponding segment. + std::atomic entry_num_{0}; + std::atomic segment_count_{0}; Entry *segments_[kMaxSegments]{}; Entry &entry_at(size_t idx) { From 6c0bc815c784c5c320c77709bad9f75283195583 Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Tue, 26 May 2026 16:57:47 +0800 Subject: [PATCH 28/47] clang format --- src/ailego/buffer/vector_page_table.cc | 3 ++- src/core/utility/buffer_storage.cc | 6 ++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/ailego/buffer/vector_page_table.cc b/src/ailego/buffer/vector_page_table.cc index 34955a2b4..5f62ca22c 100644 --- a/src/ailego/buffer/vector_page_table.cc +++ b/src/ailego/buffer/vector_page_table.cc @@ -263,7 +263,8 @@ int VecBufferPool::init() { if (!page_table_.init(block_num)) { LOG_ERROR( "VecBufferPool::init: page_table_ init failed for file[%s], " - "file_size=%zu, block_num=%zu (exceeds VectorPageTable::kMaxEntries=%zu)", + "file_size=%zu, block_num=%zu (exceeds " + "VectorPageTable::kMaxEntries=%zu)", file_name_.c_str(), file_size_, block_num, VectorPageTable::kMaxEntries); return -1; diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index 87926ab53..b42dea8df 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -262,8 +262,7 @@ class BufferStorage : public IndexStorage { owner_->file_name_.c_str(), segment_id_); return 0; } - if (ailego_unlikely( - owner_->corrupted_.load(std::memory_order_acquire))) { + if (ailego_unlikely(owner_->corrupted_.load(std::memory_order_acquire))) { LOG_ERROR( "WrappedSegment::write: storage is marked corrupted, refusing " "write, file[%s], id[%zu]", @@ -1153,8 +1152,7 @@ class BufferStorage : public IndexStorage { // would follow to garbage on the next open. Mark the storage // corrupted so subsequent writes refuse to proceed. if (buffer_pool_handle_->write_meta( - saved_old_footer_file_offset, - sizeof(saved_footer_before_split), + saved_old_footer_file_offset, sizeof(saved_footer_before_split), reinterpret_cast(&saved_footer_before_split)) != 0) { LOG_ERROR( From 351b5463bd093bb439eaa8bbc8561e502a482c29 Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Tue, 26 May 2026 21:09:43 +0800 Subject: [PATCH 29/47] fix --- src/core/utility/buffer_storage.cc | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index b42dea8df..7a65f001e 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -802,12 +802,17 @@ class BufferStorage : public IndexStorage { //! writer can slip in between flush and pool reset and lose its dirty //! pages). int flush_index_locked(void) { - // NULL GUARD: a previous append_segment() may have left the pool in a - // torn-down state. + // NULL GUARD: pool was never initialized (open() never succeeded, or + // close_index() already tore it down). This is a no-op rather than an + // error: close_index() unconditionally calls us as part of teardown, + // and a never-opened / already-closed storage simply has nothing to + // flush. Logging ERROR here would spam test logs on benign destructor + // / cleanup paths. Real corruption is still reported by the + // corrupted_ branch below. if (!buffer_pool_ || !buffer_pool_handle_) { - LOG_ERROR("BufferStorage::flush_index skipped: pool not ready, file[%s]", - file_name_.c_str()); - return IndexError_Runtime; + // Keep dirty flag in sync so a future re-open + flush is consistent. + index_dirty_.store(false, std::memory_order_relaxed); + return 0; } if (corrupted_.load(std::memory_order_acquire)) { LOG_ERROR( From 1eec9337768cb18ef8b5af9df35703107f7168cd Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Wed, 27 May 2026 22:25:49 +0800 Subject: [PATCH 30/47] fix --- src/ailego/buffer/vector_page_table.cc | 50 ++++- src/core/utility/buffer_storage.cc | 174 ++++++++++++++++-- .../zvec/ailego/buffer/vector_page_table.h | 15 +- 3 files changed, 210 insertions(+), 29 deletions(-) diff --git a/src/ailego/buffer/vector_page_table.cc b/src/ailego/buffer/vector_page_table.cc index 5f62ca22c..73f45dfff 100644 --- a/src/ailego/buffer/vector_page_table.cc +++ b/src/ailego/buffer/vector_page_table.cc @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include #include #include @@ -199,8 +200,18 @@ void VectorPageTable::evict_block(block_id_t block_id) { char *VectorPageTable::set_block_acquired(block_id_t block_id, char *buffer, size_t file_offset) { - assert(block_id < entry_num_.load(std::memory_order_relaxed)); + assert(block_id < entry_num_.load(std::memory_order_acquire)); Entry &e = entry_at(block_id); + // Diagnostics for the kEvicting wait. The wait itself never gives up: + // the only thread that can transition kEvicting -> INT_MIN is the + // evict_block() owner, so abandoning the spin here would orphan the + // entry in kEvicting forever. Instead, we use bounded backoff and emit + // tiered logs so a stuck eviction is observable. + using clock = std::chrono::steady_clock; + const auto wait_start = clock::now(); + auto last_log = wait_start; + unsigned spin_count = 0; + bool warned = false; while (true) { int current_count = e.ref_count.load(std::memory_order_acquire); if (current_count >= 0) { @@ -219,10 +230,39 @@ char *VectorPageTable::set_block_acquired(block_id_t block_id, char *buffer, e.ref_count.store(1, std::memory_order_release); return e.buffer; } else { - // kEvicting (-1): eviction is in progress on this entry. Spin briefly - // until evict_block finishes (transitions to INT_MIN). - // This is a very short critical section (flush + free, ~μs). - std::this_thread::yield(); + // kEvicting (-1): eviction is in progress on this entry. + // Tiered backoff: hot spin first, then short sleep, then longer sleep. + ++spin_count; + if (spin_count < 64) { + // Pure busy wait for the common ~μs case. + } else if (spin_count < 1024) { + std::this_thread::yield(); + } else if (spin_count < 8192) { + std::this_thread::sleep_for(std::chrono::microseconds(100)); + } else { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + // Tiered diagnostics: warn once after 100ms, error every 1s after 1s. + const auto now = clock::now(); + const auto elapsed = now - wait_start; + if (!warned && + elapsed >= std::chrono::milliseconds(100)) { + LOG_WARN( + "set_block_acquired: long kEvicting wait on block_id=%zu " + "(>=100ms); evict_block may be slow", + static_cast(block_id)); + warned = true; + } + if (elapsed >= std::chrono::seconds(1) && + (now - last_log) >= std::chrono::seconds(1)) { + const auto secs = + std::chrono::duration_cast(elapsed).count(); + LOG_ERROR( + "set_block_acquired: stuck in kEvicting on block_id=%zu for " + "%lld s; evict_block owner may be hung or starved", + static_cast(block_id), static_cast(secs)); + last_log = now; + } } } } diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index 7a65f001e..5db3ba0f8 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -253,6 +253,12 @@ class BufferStorage : public IndexStorage { //! (data_size / padding_size). Without this latch the lock-free hot //! path raced with the CRC compute, producing footer.segments_meta_crc //! that did not match the bytes pwrite()'d to disk. + //! + //! Takes a per-segment meta_mtx_ around the meta read-modify-write so + //! that two concurrent writers on the SAME segment cannot interleave + //! their (data_size, padding_size) updates and observe a state where + //! data_size + padding_size != capacity_. Different segments still + //! mutate in parallel because the mutex is per-WrappedSegment. size_t write(size_t offset, const void *data, size_t len) override { std::shared_lock latch( owner_->mapping_shards_[owner_->mapping_shard_id()].mtx); @@ -281,9 +287,15 @@ class BufferStorage : public IndexStorage { return 0; } auto meta = segment_info_->segment.meta(); - if (offset + len > meta->data_size) { - meta->data_size = offset + len; - meta->padding_size = capacity_ - meta->data_size; + { + // Per-segment mutex: serialise concurrent writers that mutate + // (data_size, padding_size) on the SAME segment so the pair + // remains consistent (sum stays == capacity_). + std::lock_guard meta_latch(meta_mtx_); + if (offset + len > meta->data_size) { + meta->data_size = offset + len; + meta->padding_size = capacity_ - meta->data_size; + } } size_t abs_offset = segment_info_->segment_header_start_offset + segment_info_->segment_header->content_offset + @@ -312,17 +324,38 @@ class BufferStorage : public IndexStorage { //! //! Takes a SHARED latch for the same reason as write(): mutating //! meta->data_size / padding_size must be excluded from the CRC - //! compute in flush_index() / append_segment(). + //! compute in flush_index() / append_segment(). The per-segment + //! meta_mtx_ additionally serialises concurrent writers on the SAME + //! segment. size_t resize(size_t size) override { std::shared_lock latch( owner_->mapping_shards_[owner_->mapping_shard_id()].mtx); + // Reject resize once the storage is marked corrupted. Without this + // guard, a resize() that lands AFTER append_segment()'s rollback + // failure would mutate meta_buf + flip index_dirty_, but the next + // flush_index_locked() would short-circuit on the same corrupted_ + // flag and never persist the change -- silent partial-update. + if (ailego_unlikely(owner_->corrupted_.load(std::memory_order_acquire))) { + LOG_ERROR( + "WrappedSegment::resize: storage is marked corrupted, refusing " + "resize, file[%s], id[%zu]", + owner_->file_name_.c_str(), segment_id_); + return 0; + } auto meta = segment_info_->segment.meta(); - if (meta->data_size != size) { - if (size > capacity_) { - size = capacity_; + bool changed = false; + { + std::lock_guard meta_latch(meta_mtx_); + if (meta->data_size != size) { + if (size > capacity_) { + size = capacity_; + } + meta->data_size = size; + meta->padding_size = capacity_ - size; + changed = true; } - meta->data_size = size; - meta->padding_size = capacity_ - size; + } + if (changed) { owner_->set_as_dirty(); } return size; @@ -332,11 +365,28 @@ class BufferStorage : public IndexStorage { //! //! Takes a SHARED latch for the same reason as write(): mutating //! meta->data_crc must be excluded from the CRC compute in - //! flush_index() / append_segment(). + //! flush_index() / append_segment(). The per-segment meta_mtx_ + //! ensures the data_crc store does not interleave with a concurrent + //! write()/resize() update of (data_size, padding_size). void update_data_crc(uint32_t crc) override { std::shared_lock latch( owner_->mapping_shards_[owner_->mapping_shard_id()].mtx); - segment_info_->segment.meta()->data_crc = crc; + // Same rationale as resize(): refuse the meta mutation once the + // storage is corrupted, otherwise the CRC update would be lost on + // the next flush_index_locked() (which itself short-circuits on + // corrupted_), leaving on-disk and in-memory CRCs permanently + // diverged. + if (ailego_unlikely(owner_->corrupted_.load(std::memory_order_acquire))) { + LOG_ERROR( + "WrappedSegment::update_data_crc: storage is marked corrupted, " + "refusing CRC update, file[%s], id[%zu]", + owner_->file_name_.c_str(), segment_id_); + return; + } + { + std::lock_guard meta_latch(meta_mtx_); + segment_info_->segment.meta()->data_crc = crc; + } owner_->set_as_dirty(); } @@ -353,6 +403,12 @@ class BufferStorage : public IndexStorage { // so that re-parses after append_segment() are observed without // needing to recreate WrappedSegment instances held by callers. IndexMapping::SegmentInfo *segment_info_{nullptr}; + // Per-segment mutex protecting concurrent writer access to the + // (data_size, padding_size, data_crc) fields of segment_info_->segment. + // The owner's shard shared_mutex still excludes these writers vs + // flush_index()'s AllShardsExclusiveLatch; this mutex additionally + // serialises hot-path writers on the SAME WrappedSegment. + mutable std::mutex meta_mtx_{}; private: BufferStorage *owner_{nullptr}; @@ -1130,6 +1186,31 @@ class BufferStorage : public IndexStorage { const uint64_t saved_current_header_start = current_header_start_offset_; // All split disk writes succeeded -- commit in-memory state. + // + // STRONG EXCEPTION GUARANTEE: reserve() growth FIRST so the three + // push_back's below cannot throw (capacity is sufficient and the + // moved-in elements -- unique_ptr, unique_ptr, + // and the POD MetaChain aggregate -- have noexcept move ctors). + // Without this, a bad_alloc in the middle of the three push_back's + // leaves chain_headers_/buffer_pool_buffers_/meta_chains_ at + // mismatched sizes (one or two extended, the rest not), with + // footer_/current_header_start_offset_ either still or already + // pointing at the new chain. flush_index_locked() then iterates + // `min(meta_chains_.size(), buffer_pool_buffers_.size())` and + // silently skips the orphan chain, while ParseToMapping() on next + // open follows the on-disk forward link and DOES see it -- a + // classic split-brain. + try { + chain_headers_.reserve(chain_headers_.size() + 1); + buffer_pool_buffers_.reserve(buffer_pool_buffers_.size() + 1); + meta_chains_.reserve(meta_chains_.size() + 1); + } catch (const std::bad_alloc &) { + LOG_ERROR( + "append_segment: reserve for chain-split commit failed, file[%s]", + file_name_.c_str()); + undo_old_footer(); + return IndexError_Runtime; + } chain->footer = linked_footer; // old chain keeps linked footer chain_headers_.push_back(std::move(new_header)); buffer_pool_buffers_.push_back(std::move(new_meta_buf)); @@ -1272,13 +1353,54 @@ class BufferStorage : public IndexStorage { } // All disk writes succeeded -- commit remaining in-memory state. + // + // STRONG EXCEPTION GUARANTEE: emplace into segments_ and id_hash_ as + // a single transactional unit. unordered_map::emplace() can throw + // bad_alloc (node allocation), so if id_hash_ throws after segments_ + // succeeded, undo the segments_ insertion before propagating the + // failure. Otherwise segments_ would carry an entry with no + // matching id_hash_ slot -- get(id) would return the segment via + // segments_, but any IVF/HNSW path that joins through id_hash_ + // would silently miss it, producing the lopsided mapping the prior + // bug history attributes to id_hash_ races. + // // WrappedSegment instances already held by callers reference // &segments_[name], whose address is stable across unordered_map // insertions, so existing references stay valid. - segments_[id] = IndexMapping::SegmentInfo{ - IndexMapping::Segment{new_seg}, chain->header_start_offset, header}; - const size_t new_id = id_hash_.size(); - id_hash_[id] = new_id; + auto seg_ins = segments_.end(); + bool seg_inserted = false; + try { + auto ins = segments_.emplace( + id, IndexMapping::SegmentInfo{IndexMapping::Segment{new_seg}, + chain->header_start_offset, header}); + if (!ins.second) { + // Re-insertion under exclusive latch should be impossible (we + // checked find() earlier in the same critical section), but be + // defensive: fail loudly and roll the whole append back. + LOG_ERROR( + "append_segment: duplicate id appeared after commit, file[%s], " + "id[%s]", + file_name_.c_str(), id.c_str()); + rollback_step2(); + rollback_step1(); + return IndexError_Duplicate; + } + seg_ins = ins.first; + seg_inserted = true; + const size_t new_id = id_hash_.size(); + id_hash_.emplace(id, new_id); + } catch (const std::bad_alloc &) { + LOG_ERROR( + "append_segment: in-memory commit OOM, rolling back, file[%s], " + "id[%s]", + file_name_.c_str(), id.c_str()); + if (seg_inserted) { + segments_.erase(seg_ins); + } + rollback_step2(); + rollback_step1(); + return IndexError_Runtime; + } max_segment_size_ = std::max(max_segment_size_, padded_size); // ---- Step 3: With the segmented page table (C1), extend_file() @@ -1317,12 +1439,24 @@ class BufferStorage : public IndexStorage { }; mutable MutexShard mapping_shards_[kMappingMutexShards]{}; - // Per-thread shard selection (stable hash, no syscall). + // Per-(thread, instance) shard selection. We combine std::thread::id + // with `this` so that: + // 1) Two BufferStorage instances accessed from the SAME thread map + // to (typically) DIFFERENT shards. The previous thread_local-only + // implementation cached a single id per thread regardless of + // instance, which collapsed all instances onto one shard for that + // thread and effectively defeated sharding. + // 2) Skewed thread::id distributions (on glibc, thread::id is the + // aligned pthread_t pointer; `% 32` clusters) are dispersed by the + // boost-style hash_combine mix. + // Cost: ~3 ALU ops + one mod; cheaper than the cache-line ping-pong + // that the bug caused. size_t mapping_shard_id() const { - thread_local const size_t id = - std::hash()(std::this_thread::get_id()) % - kMappingMutexShards; - return id; + size_t seed = std::hash()(std::this_thread::get_id()); + size_t inst = std::hash()(static_cast(this)); + // boost::hash_combine(seed, inst) + seed ^= inst + 0x9e3779b97f4a7c15ULL + (seed << 6) + (seed >> 2); + return seed % kMappingMutexShards; } // RAII guard that locks ALL shards exclusively (for writers). diff --git a/src/include/zvec/ailego/buffer/vector_page_table.h b/src/include/zvec/ailego/buffer/vector_page_table.h index 337c28c59..f2e78a061 100644 --- a/src/include/zvec/ailego/buffer/vector_page_table.h +++ b/src/include/zvec/ailego/buffer/vector_page_table.h @@ -105,19 +105,19 @@ class VectorPageTable { //! Mark a loaded block as dirty so that it is persisted on eviction. void mark_dirty(block_id_t block_id) { - assert(block_id < entry_num_.load(std::memory_order_relaxed)); + assert(block_id < entry_num_.load(std::memory_order_acquire)); entry_at(block_id).is_dirty.store(true, std::memory_order_relaxed); } bool is_block_dirty(block_id_t block_id) const { - assert(block_id < entry_num_.load(std::memory_order_relaxed)); + assert(block_id < entry_num_.load(std::memory_order_acquire)); return entry_at(block_id).is_dirty.load(std::memory_order_relaxed); } //! Flush a single dirty block without evicting it. Caller guarantees the //! block is currently loaded (buffer != nullptr). int flush_block(block_id_t block_id) { - assert(block_id < entry_num_.load(std::memory_order_relaxed)); + assert(block_id < entry_num_.load(std::memory_order_acquire)); Entry &e = entry_at(block_id); char *buffer = e.buffer; if (!buffer || !flush_callback_) { @@ -141,7 +141,7 @@ class VectorPageTable { } bool is_released(block_id_t block_id) const { - assert(block_id < entry_num_.load(std::memory_order_relaxed)); + assert(block_id < entry_num_.load(std::memory_order_acquire)); return entry_at(block_id).ref_count.load(std::memory_order_relaxed) <= 0; } @@ -176,10 +176,17 @@ class VectorPageTable { std::atomic segment_count_{0}; Entry *segments_[kMaxSegments]{}; + // Pair with the release-store on segment_count_ in init()/extend() so + // that any reader observing the published segment table also sees the + // fully-initialized segments_[s] pointer and Entry slots. Without this + // acquire load, segments_[s] can be re-read as nullptr or a torn + // pointer on weak memory models (and even reordered on x86 under -O2). Entry &entry_at(size_t idx) { + (void)segment_count_.load(std::memory_order_acquire); return segments_[idx >> kSegmentShift][idx & kSegmentMask]; } const Entry &entry_at(size_t idx) const { + (void)segment_count_.load(std::memory_order_acquire); return segments_[idx >> kSegmentShift][idx & kSegmentMask]; } From 1b1c221ef2e23f96c9d0c5d6af7862c2c22ebed1 Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Wed, 27 May 2026 22:47:13 +0800 Subject: [PATCH 31/47] clang format --- src/ailego/buffer/vector_page_table.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/ailego/buffer/vector_page_table.cc b/src/ailego/buffer/vector_page_table.cc index 73f45dfff..2c7c41667 100644 --- a/src/ailego/buffer/vector_page_table.cc +++ b/src/ailego/buffer/vector_page_table.cc @@ -245,8 +245,7 @@ char *VectorPageTable::set_block_acquired(block_id_t block_id, char *buffer, // Tiered diagnostics: warn once after 100ms, error every 1s after 1s. const auto now = clock::now(); const auto elapsed = now - wait_start; - if (!warned && - elapsed >= std::chrono::milliseconds(100)) { + if (!warned && elapsed >= std::chrono::milliseconds(100)) { LOG_WARN( "set_block_acquired: long kEvicting wait on block_id=%zu " "(>=100ms); evict_block may be slow", From 6fa450fe981a6604f4874503c9d64cc50419502c Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Thu, 28 May 2026 11:45:36 +0800 Subject: [PATCH 32/47] fix --- src/core/utility/buffer_storage.cc | 591 +++++++++++------------------ 1 file changed, 219 insertions(+), 372 deletions(-) diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index 5db3ba0f8..5735ecedf 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -31,16 +32,11 @@ namespace zvec { namespace core { -// Cross-page reads through the legacy read(const void**) overload need a -// buffer whose lifetime is at least as long as the BufferStorage itself, -// because callers store the returned pointer indefinitely (the historical -// contract is "pointer is valid until the storage is closed"). Earlier -// revisions used a thread_local scratch buffer here, which subtly broke -// that contract: the next cross-page read(const void**) on the SAME thread -// silently overwrote the buffer, dangling every previously-handed-out -// pointer. We now allocate per call and hand ownership to the storage's -// tmp_buffers_ list (freed in close_index()). Callers that want bounded -// memory should migrate to the read(MemoryBlock&) overload. +// The legacy read(const void**) overload guarantees the returned pointer +// stays valid until close_index(). Single-page reads pin the page +// (never released); cross-page reads allocate a temp buffer owned by +// tmp_buffers_ (freed in close_index()). Callers wanting bounded +// lifetime should use the read(MemoryBlock&) overload. /*! Buffer Storage */ @@ -54,15 +50,7 @@ class BufferStorage : public IndexStorage { //! Index Storage Pointer typedef std::shared_ptr Pointer; - //! Constructor - //! - //! `info` MUST be a pointer into BufferStorage::segments_ (an - //! unordered_map mapped value). C++ guarantees those pointers stay - //! valid across insertions, so the WrappedSegment can safely fetch - //! the LATEST segment_header / segment_header_start_offset / Segment - //! after a re-parse caused by append_segment(). Storing the pointer - //! (rather than copying header_/offset into local fields) is what - //! prevents use-after-free when chain_headers_ is rebuilt. + //! Constructor. See segment_info_ for the pointer-stability contract. WrappedSegment(BufferStorage *owner, IndexMapping::SegmentInfo *info, size_t segment_id) : segment_info_(info), @@ -152,21 +140,13 @@ class BufferStorage : public IndexStorage { return 0; } *data = raw; - // NOTE: get_single_page() acquires a pin on the page; we intentionally - // do NOT release it here. The legacy contract of read(const void**) - // is that the returned pointer remains valid until the storage is - // closed (an implicit, never-released pin). Many call sites rely on - // this lifetime guarantee. Callers that want explicit pin/release - // semantics should migrate to the read(MemoryBlock&) overload, which - // hands the ref-count to a RAII MemoryBlock. + // Pin held until close_index() per the never-released contract + // of this overload. (void)page_id; return len; } - // Cross-page path: allocate a buffer whose ownership is handed to - // owner_->tmp_buffers_ so that the returned pointer remains valid - // for the entire lifetime of the BufferStorage (matching the - // single-page "pinned forever" semantics established above). - // C11 aligned_alloc requires size to be a multiple of alignment. + // Cross-page path: see file-level banner. C11 aligned_alloc requires + // size to be a multiple of alignment. const size_t kAlign = 4096UL; size_t alloc_size = (len + (kAlign - 1UL)) & ~(kAlign - 1UL); char *tmp = @@ -243,22 +223,13 @@ class BufferStorage : public IndexStorage { return len; } - //! Write data into the storage with offset - //! - //! Takes a SHARED latch on the owner's mapping shard. This pairs with - //! the EXCLUSIVE all-shards latch held by flush_index() / append_segment() - //! around the meta_buf CRC + write_meta phase: writers parallelize - //! across (and within) shards, but are fully excluded while CRC is - //! computed over the meta_buf bytes that this method mutates - //! (data_size / padding_size). Without this latch the lock-free hot - //! path raced with the CRC compute, producing footer.segments_meta_crc - //! that did not match the bytes pwrite()'d to disk. + //! Write data into the storage with offset. //! - //! Takes a per-segment meta_mtx_ around the meta read-modify-write so - //! that two concurrent writers on the SAME segment cannot interleave - //! their (data_size, padding_size) updates and observe a state where - //! data_size + padding_size != capacity_. Different segments still - //! mutate in parallel because the mutex is per-WrappedSegment. + //! Locking: shared shard latch pairs with flush_index()'s exclusive + //! all-shards latch -- excludes CRC compute over meta_buf while we + //! mutate (data_size, padding_size). meta_mtx_ additionally + //! serialises concurrent writers on the SAME segment so the pair + //! stays consistent (sum == capacity_). size_t write(size_t offset, const void *data, size_t len) override { std::shared_lock latch( owner_->mapping_shards_[owner_->mapping_shard_id()].mtx); @@ -288,9 +259,6 @@ class BufferStorage : public IndexStorage { } auto meta = segment_info_->segment.meta(); { - // Per-segment mutex: serialise concurrent writers that mutate - // (data_size, padding_size) on the SAME segment so the pair - // remains consistent (sum stays == capacity_). std::lock_guard meta_latch(meta_mtx_); if (offset + len > meta->data_size) { meta->data_size = offset + len; @@ -306,35 +274,17 @@ class BufferStorage : public IndexStorage { abs_offset); return 0; } - // ALWAYS mark dirty after a successful page-cache write so that the - // next flush_index() does NOT take the `if (!index_dirty_) return 0;` - // short-circuit and skip flush_all(). Previously this was only set - // when `data_size` grew, which meant fixed-size segments (e.g. - // chunk_meta_segment writing HnswChunkMeta in place) never raised - // the dirty flag -- their 4K page-cache pages were not flushed before - // append_segment(), so the freshly-rebuilt page table - // pread'd stale content from disk and chunk_cnts[NODE] lagged the - // real segment count, eventually causing sync_chunks() to see a - // mid-state segment and crash with a NULL Chunk::Pointer. + // Mark dirty unconditionally even when data_size did not grow: + // fixed-size in-place rewrites (e.g. chunk_meta_segment) must still + // trigger flush_all() before the next append_segment(). owner_->set_as_dirty(); return len; } - //! Resize size of data - //! - //! Takes a SHARED latch for the same reason as write(): mutating - //! meta->data_size / padding_size must be excluded from the CRC - //! compute in flush_index() / append_segment(). The per-segment - //! meta_mtx_ additionally serialises concurrent writers on the SAME - //! segment. + //! Resize size of data. See write() for the locking contract. size_t resize(size_t size) override { std::shared_lock latch( owner_->mapping_shards_[owner_->mapping_shard_id()].mtx); - // Reject resize once the storage is marked corrupted. Without this - // guard, a resize() that lands AFTER append_segment()'s rollback - // failure would mutate meta_buf + flip index_dirty_, but the next - // flush_index_locked() would short-circuit on the same corrupted_ - // flag and never persist the change -- silent partial-update. if (ailego_unlikely(owner_->corrupted_.load(std::memory_order_acquire))) { LOG_ERROR( "WrappedSegment::resize: storage is marked corrupted, refusing " @@ -361,21 +311,10 @@ class BufferStorage : public IndexStorage { return size; } - //! Update crc of data - //! - //! Takes a SHARED latch for the same reason as write(): mutating - //! meta->data_crc must be excluded from the CRC compute in - //! flush_index() / append_segment(). The per-segment meta_mtx_ - //! ensures the data_crc store does not interleave with a concurrent - //! write()/resize() update of (data_size, padding_size). + //! Update crc of data. See write() for the locking contract. void update_data_crc(uint32_t crc) override { std::shared_lock latch( owner_->mapping_shards_[owner_->mapping_shard_id()].mtx); - // Same rationale as resize(): refuse the meta mutation once the - // storage is corrupted, otherwise the CRC update would be lost on - // the next flush_index_locked() (which itself short-circuits on - // corrupted_), leaving on-disk and in-memory CRCs permanently - // diverged. if (ailego_unlikely(owner_->corrupted_.load(std::memory_order_acquire))) { LOG_ERROR( "WrappedSegment::update_data_crc: storage is marked corrupted, " @@ -397,17 +336,12 @@ class BufferStorage : public IndexStorage { protected: friend BufferStorage; - // Pointer into BufferStorage::segments_ (an unordered_map mapped value). - // C++ guarantees the address stays valid across map insertions. All - // header / start-offset / segment-meta accesses go through this pointer - // so that re-parses after append_segment() are observed without - // needing to recreate WrappedSegment instances held by callers. + // Pointer into BufferStorage::segments_ (unordered_map mapped value). + // The address is stable across map insertions, so re-parses after + // append_segment() are picked up without recreating WrappedSegment. IndexMapping::SegmentInfo *segment_info_{nullptr}; - // Per-segment mutex protecting concurrent writer access to the - // (data_size, padding_size, data_crc) fields of segment_info_->segment. - // The owner's shard shared_mutex still excludes these writers vs - // flush_index()'s AllShardsExclusiveLatch; this mutex additionally - // serialises hot-path writers on the SAME WrappedSegment. + // Serialises hot-path writers on the SAME segment so + // (data_size, padding_size, data_crc) updates do not interleave. mutable std::mutex meta_mtx_{}; private: @@ -481,12 +415,11 @@ class BufferStorage : public IndexStorage { return 0; } + // PRECONDITION (also for ParseFooter/ParseSegment/ParseToMapping): + // caller holds either single-threaded open() or AllShardsExclusiveLatch. + // Do NOT add an internal lock here -- std::shared_mutex is not reentrant. int ParseHeader(size_t offset, IndexFormat::MetaHeader *out) { std::unique_ptr buffer(new char[sizeof(*out)]); - // ParseHeader is called from ParseToMapping which is itself called - // from either open() (single-threaded) or append_segment() (under - // AllShardsExclusiveLatch). Do NOT add an internal lock here -- - // std::shared_mutex is not reentrant -> deadlock. if (buffer_pool_handle_->get_meta(offset, sizeof(*out), buffer.get()) != 0) { LOG_ERROR("Get segment header failed."); @@ -507,7 +440,6 @@ class BufferStorage : public IndexStorage { int ParseFooter(size_t offset) { std::unique_ptr buffer(new char[sizeof(footer_)]); - // Bypass wrapper -- see ParseHeader() comment for why. if (buffer_pool_handle_->get_meta(offset, sizeof(footer_), buffer.get()) != 0) { LOG_ERROR("Get segment footer failed."); @@ -529,13 +461,8 @@ class BufferStorage : public IndexStorage { int ParseSegment(size_t offset, IndexFormat::MetaHeader *chain_header, uint32_t *out_segment_ids_offset) { - // NOTE: this function is only called from ParseToMapping(), which is - // itself called from either open() (single-threaded construction) or - // append_segment() (under AllShardsExclusiveLatch). Do NOT add an - // internal lock here -- doing so would deadlock the append path. std::unique_ptr segment_buffer = std::make_unique(footer_.segments_meta_size); - // Bypass wrapper -- see ParseHeader() comment for why. if (buffer_pool_handle_->get_meta(offset, footer_.segments_meta_size, segment_buffer.get()) != 0) { LOG_ERROR("Get segment meta failed."); @@ -565,26 +492,32 @@ class BufferStorage : public IndexStorage { if (iter->segment_id_offset < segment_ids_offset) { segment_ids_offset = iter->segment_id_offset; } - // Assign a stable numeric ID (block_id in the page table) to this - // segment. We use id_hash_.size() rather than segments_.size() because - // segments_ is intentionally NOT cleared between appends (to keep - // existing WrappedSegment pointers valid), so segments_.size() would - // reflect stale entries and produce wrong IDs on re-parse. - const std::string seg_name(reinterpret_cast(segment_start) + - iter->segment_id_offset); + // Use id_hash_.size() (not segments_.size()) for the block_id: + // segments_ is intentionally NOT cleared between appends to keep + // existing WrappedSegment pointers valid, so it carries stale entries. + // + // Bound the C-string scan to the segments_meta buffer so a missing + // NUL terminator cannot walk past the buffer end (defence against + // crafted-CRC inputs; CRC already covers benign bit flips). + const char *seg_name_start = + reinterpret_cast(segment_start) + + iter->segment_id_offset; + const size_t seg_name_max = + footer_.segments_meta_size - iter->segment_id_offset; + const size_t seg_name_len = ::strnlen(seg_name_start, seg_name_max); + if (seg_name_len == seg_name_max) { + LOG_ERROR( + "ParseSegment: segment_id missing NUL terminator, file[%s]", + file_name_.c_str()); + return IndexError_InvalidValue; + } + const std::string seg_name(seg_name_start, seg_name_len); const size_t seg_id = id_hash_.size(); id_hash_[seg_name] = seg_id; - // Update the segments_ entry in-place so that any WrappedSegment - // instances that already hold a pointer to this entry (via - // &segments_[name].segment) continue to use the refreshed meta_ptr_ - // after the re-parse. - // - // IMPORTANT: chain_header points into chain_headers_ which is a - // std::vector>; each chain owns its OWN - // MetaHeader copy. Do NOT use a shared &header_ here -- when there - // are multiple meta-header chains in the file, the next ParseHeader() - // would overwrite that single instance and break content_offset for - // all earlier-chain segments. + // In-place update so existing WrappedSegment pointers see the + // refreshed meta_ptr_ after re-parse. chain_header MUST be the + // per-chain owning copy (not a shared &header_) -- see + // chain_headers_ field comment. segments_[seg_name] = IndexMapping::SegmentInfo{IndexMapping::Segment{iter}, current_header_start_offset_, chain_header}; @@ -605,10 +538,7 @@ class BufferStorage : public IndexStorage { int ParseToMapping() { while (true) { int ret; - // Allocate an OWN MetaHeader for this chain so that subsequent chains - // never overwrite earlier-chain headers (prior implementation used a - // single header_ member, which corrupted content_offset for chain-0 - // segments once chain-1 was parsed). + // Per-chain owning MetaHeader; see chain_headers_ field comment. chain_headers_.emplace_back(std::make_unique()); IndexFormat::MetaHeader *chain_header = chain_headers_.back().get(); ret = ParseHeader(current_header_start_offset_, chain_header); @@ -635,6 +565,17 @@ class BufferStorage : public IndexStorage { } uint64_t footer_offset = chain_header->meta_footer_offset + current_header_start_offset_; + // Reject uint64 wrap-around and offsets past file_size. + if (footer_offset < current_header_start_offset_ || + footer_offset + sizeof(IndexFormat::MetaFooter) > + buffer_pool_->file_size()) { + LOG_ERROR( + "ParseToMapping: invalid footer_offset=%lu (header=%lu, " + "file_size=%lu), file[%s]", + footer_offset, current_header_start_offset_, + buffer_pool_->file_size(), file_name_.c_str()); + return IndexError_InvalidValue; + } ret = ParseFooter(footer_offset); if (ret != 0) { LOG_ERROR("Failed to parse footer, errno %d, %s", ret, @@ -667,7 +608,31 @@ class BufferStorage : public IndexStorage { if (footer_.next_meta_header_offset == 0) { break; } - current_header_start_offset_ = footer_.next_meta_header_offset; + // Reject self-reference / backward jumps and offsets past file_size: + // such a corrupted next_meta_header_offset would otherwise drive the + // loop into infinite chain growth -> OOM. + const uint64_t next_off = footer_.next_meta_header_offset; + if (next_off <= current_header_start_offset_ || + next_off + sizeof(IndexFormat::MetaHeader) > + buffer_pool_->file_size()) { + LOG_ERROR( + "ParseToMapping: invalid next_meta_header_offset=%lu " + "(current=%lu, file_size=%lu), file[%s]", + next_off, current_header_start_offset_, + buffer_pool_->file_size(), file_name_.c_str()); + return IndexError_InvalidValue; + } + // Bound chain count: 1024 chains @ default 1MB segment_meta_capacity + // covers >1GB of metadata, far above realistic load. + constexpr size_t kMaxChains = 1024; + if (chain_headers_.size() >= kMaxChains) { + LOG_ERROR( + "ParseToMapping: chain count exceeds limit %zu, file[%s] may " + "be corrupted", + kMaxChains, file_name_.c_str()); + return IndexError_InvalidLength; + } + current_header_start_offset_ = next_off; } return 0; } @@ -783,32 +748,20 @@ class BufferStorage : public IndexStorage { return ret; } - //! Set the index file as dirty. - //! - //! HOT PATH: called once per WrappedSegment::write() / resize() / - //! update_data_crc(). We MUST unconditionally store(true) here, not - //! guard with a load-then-store: under relaxed semantics a writer can - //! observe a stale dirty=true (its own core's cached value) AFTER - //! flush_index() has CAS'd dirty to false on another core, then skip - //! its own store and the writer's modification gets dropped (next - //! flush_index() short-circuits at the top because dirty is false). - //! The MESI ping-pong is the cost of correctness; it is bounded by the - //! caller's write rate and amortized by the caller's actual I/O. + //! Mark the index as dirty. HOT PATH: store(true) unconditionally -- + //! a load-then-store guard could let a stale cached `true` skip the + //! store after flush_index() CAS'd dirty=false on another core, losing + //! the writer's modification. void set_as_dirty(void) { index_dirty_.store(true, std::memory_order_relaxed); } //! Refresh meta information (checksum, update time, etc.) void refresh_index(uint64_t chkp) { - // Monotonic merge: callers may invoke refresh() out of order under - // concurrency (parallel writers, retries, batched commits delivered on - // different threads). An unconditional store would let a smaller chkp - // arriving later overwrite a larger one, violating the upper-layer - // invariant that the persisted check_point is non-decreasing. CAS-loop - // max guarantees the largest observed value wins regardless of arrival - // order; relaxed ordering is sufficient because flush_index() takes the - // all-shards exclusive latch which establishes the necessary - // happens-before for the actual disk write. + // CAS-loop max: callers may invoke refresh() out of order, and the + // persisted check_point must be non-decreasing. Relaxed ordering is + // sufficient because flush_index() takes AllShardsExclusiveLatch which + // establishes the necessary happens-before for the disk write. if (chkp != 0) { uint64_t cur = pending_check_point_.load(std::memory_order_relaxed); while (chkp > cur) { @@ -816,57 +769,32 @@ class BufferStorage : public IndexStorage { cur, chkp, std::memory_order_relaxed)) { break; } - // compare_exchange_weak refreshed `cur`; loop checks chkp > cur - // again and exits if some other thread already raised pending past - // our value. } } - // In BufferStorage the segment metadata lives in buffer_pool_buffers_. - // CRC recomputation and disk write are deferred to flush_index(). - // Mark dirty unconditionally for the same reason as set_as_dirty(): - // a load-then-store guard would let a stale `true` observation skip - // the store and lose this refresh. Note: even when our chkp lost the - // CAS race (was discarded as stale), we still set dirty -- the winning - // larger chkp must be flushed, and flush_index()'s UpdateMetaFooter() - // is a no-op for chkp==0 so a spurious extra flush is harmless. + // Set dirty unconditionally even if our chkp lost the CAS race: the + // winning larger chkp must still be flushed. index_dirty_.store(true, std::memory_order_relaxed); } - //! Flush index storage: persists any pending meta changes (segments_meta + - //! footer) for each header chain, then asks the page cache to write back - //! dirty data pages. + //! Flush index storage. int flush_index(void) { if (!index_dirty_.load(std::memory_order_relaxed)) { return 0; } - // EXCLUSIVE all-shards latch: blocks the lock-free hot path - // (WrappedSegment::write / resize / update_data_crc) which mutates - // meta->data_size / padding_size / data_crc, the very bytes we hash - // to recompute footer.segments_meta_crc and pwrite to disk. Holding - // a single shard's shared lock (the previous design) was insufficient - // because writers on other shards could race with the CRC compute - // and produce a checksum that mismatches the on-disk segment_meta - // bytes, causing IndexError_InvalidChecksum on the next open(). + // Exclusive all-shards latch excludes the lock-free hot path while we + // hash meta_buf and pwrite footer; without it segments_meta_crc would + // not match the bytes on disk. AllShardsExclusiveLatch latch(mapping_shards_); return flush_index_locked(); } - //! Internal flush implementation. PRECONDITION: caller MUST already hold - //! AllShardsExclusiveLatch on mapping_shards_. Used by flush_index() - //! (which acquires the latch itself) and by close_index() (which must - //! flush and tear down under a SINGLE continuous latch hold so that no - //! writer can slip in between flush and pool reset and lose its dirty - //! pages). + //! PRECONDITION: caller holds AllShardsExclusiveLatch. Used by + //! flush_index() (acquires the latch) and close_index() (must flush + //! and tear down under one continuous latch hold). int flush_index_locked(void) { - // NULL GUARD: pool was never initialized (open() never succeeded, or - // close_index() already tore it down). This is a no-op rather than an - // error: close_index() unconditionally calls us as part of teardown, - // and a never-opened / already-closed storage simply has nothing to - // flush. Logging ERROR here would spam test logs on benign destructor - // / cleanup paths. Real corruption is still reported by the - // corrupted_ branch below. + // No-op on never-opened / already-closed storage: close_index() + // unconditionally calls us during teardown. if (!buffer_pool_ || !buffer_pool_handle_) { - // Keep dirty flag in sync so a future re-open + flush is consistent. index_dirty_.store(false, std::memory_order_relaxed); return 0; } @@ -882,37 +810,23 @@ class BufferStorage : public IndexStorage { index_dirty_.store(false, std::memory_order_relaxed); return 0; } - // Atomically claim the dirty flag at the START of the flush, not at the - // end. This prevents a TOCTOU race against the lock-free hot path: - // any WrappedSegment::write() that happens between flush_all() and the - // end of this function will simply re-set dirty=true (its set_as_dirty - // observes our cleared flag), and the next flush_index() will pick up - // those new dirty pages. An unconditional store(false) at the end - // would silently swallow that concurrent write. + // Claim dirty atomically AT THE START so any concurrent write() that + // lands during this flush re-sets dirty=true and is picked up by the + // next flush; an unconditional store(false) at the end would silently + // swallow it. bool expected_dirty = true; if (!index_dirty_.compare_exchange_strong(expected_dirty, false, std::memory_order_relaxed)) { - // Another thread already claimed and is performing the flush; treat - // this call as a no-op. The previous design (no CAS) allowed - // duplicate concurrent flushers; bailing out here is strictly safer - // because both flushers would otherwise race on per-chain footer - // mutation in the loop below. + // Another thread already claimed; bail out. return 0; } - // Snapshot the pending checkpoint AFTER claiming dirty so that we - // observe at least every refresh_index() that happened before we - // claimed. The CAS-reset at the end will preserve any newer chkp - // stored by a concurrent refresh_index() during this flush. + // Snapshot pending_check_point_ AFTER claiming dirty: any newer chkp + // stored by a concurrent refresh_index() will be preserved by the + // CAS-reset at the end (and refresh_index() will have re-set dirty). const uint64_t consumed_chkp = pending_check_point_.load(std::memory_order_relaxed); - // Restore consumed_chkp into pending_check_point_ on any failure path - // below so that the in-flight value is not lost. Although the current - // implementation only LOADs consumed_chkp (so pending already holds it), - // this explicit monotonic CAS-back makes the invariant - // (pending_check_point_ >= consumed_chkp) self-evident and resilient to - // future refactors that might exchange/zero pending eagerly. Uses the - // same CAS-loop max as refresh_index() so a concurrent larger chkp - // wins. + // Restore consumed_chkp on failure paths (CAS-loop max, same as + // refresh_index()) so a concurrent larger chkp wins. auto restore_chkp_on_failure = [this, consumed_chkp]() { if (consumed_chkp == 0) return; uint64_t cur = pending_check_point_.load(std::memory_order_relaxed); @@ -923,27 +837,21 @@ class BufferStorage : public IndexStorage { } } }; - // Flush all dirty data blocks to the backing file first. + // Flush dirty data blocks first. if (buffer_pool_handle_->flush_all() != 0) { - // Restore dirty so the next flush_index() retries. index_dirty_.store(true, std::memory_order_relaxed); restore_chkp_on_failure(); LOG_ERROR("flush_all data blocks failed: file[%s]", file_name_.c_str()); return IndexError_WriteData; } - // For each metadata chain, recompute the segment-meta CRC, update the - // in-memory footer (segments_meta_crc + footer_crc + update_time), and - // write both the segment metadata and the footer back to the backing - // file. Uses the per-chain in-memory footer copy, avoiding a pread. + // Per-chain: recompute segments_meta CRC, refresh footer, pwrite both. for (size_t ci = 0; ci < meta_chains_.size() && ci < buffer_pool_buffers_.size(); ++ci) { MetaChain &mchain = meta_chains_[ci]; const char *seg_buf = buffer_pool_buffers_[ci].get(); - // Recompute segment metadata CRC and refresh the per-chain footer. mchain.footer.segments_meta_crc = ailego::Crc32c::Hash(seg_buf, mchain.segment_meta_size, 0u); IndexFormat::UpdateMetaFooter(&mchain.footer, consumed_chkp); - // Write segment metadata back to disk. if (buffer_pool_handle_->write_meta(mchain.segment_meta_file_offset, mchain.segment_meta_size, seg_buf) != 0) { @@ -953,7 +861,6 @@ class BufferStorage : public IndexStorage { restore_chkp_on_failure(); return IndexError_WriteData; } - // Write the updated footer back to disk. if (buffer_pool_handle_->write_meta( mchain.footer_file_offset, sizeof(mchain.footer), reinterpret_cast(&mchain.footer)) != 0) { @@ -964,16 +871,12 @@ class BufferStorage : public IndexStorage { return IndexError_WriteData; } } - // Keep the convenience alias in sync with the last chain. if (!meta_chains_.empty()) { footer_ = meta_chains_.back().footer; } - // CAS-reset pending: only consume the checkpoint we observed at the - // start. If a concurrent refresh_index() stored a newer value during - // the flush, CAS fails and the newer value remains in - // pending_check_point_; refresh_index() also re-set dirty=true (since - // we cleared it at the top), so the next flush_index() will persist - // the newer chkp. + // CAS-reset pending: only consume the chkp we observed. A concurrent + // larger chkp survives and will be flushed next round (refresh_index() + // also re-set dirty). uint64_t expected_chkp = consumed_chkp; pending_check_point_.compare_exchange_strong(expected_chkp, 0, std::memory_order_relaxed); @@ -982,17 +885,9 @@ class BufferStorage : public IndexStorage { //! Close index storage void close_index(void) { - // Take the all-shards exclusive latch BEFORE flushing, and hold it for - // the entire teardown sequence. Earlier code released the latch - // between flush and teardown, opening a window in which a writer could - // grab a shared lock, mutate meta_buf via WrappedSegment::write() and - // call set_as_dirty(true). After this close_index() reacquired the - // latch and reset buffer_pool_handle_, those dirty pages would be - // dropped on the floor with no chance to flush. Holding a SINGLE - // latch instance across flush_index_locked() and the reset eliminates - // that window: writers can only enter once we have fully torn down - // (and at that point segments_/buffer_pool_handle_ are gone, so they - // would fail the null/state guards in WrappedSegment). + // Hold ONE continuous all-shards latch across flush + teardown so no + // writer can slip in between (which would dirty meta_buf only to have + // the page table reset under it, dropping the modification). AllShardsExclusiveLatch latch(mapping_shards_); flush_index_locked(); file_name_.clear(); @@ -1020,13 +915,11 @@ class BufferStorage : public IndexStorage { corrupted_.store(false, std::memory_order_relaxed); } - //! Append a segment into storage. - //! - //! C1: the page table extends in-place (no pool rotation). The exclusive - //! latch is held only briefly to protect segments_/id_hash_ insertion. + //! Append a segment into storage. C1: page table extends in-place; + //! latch held only briefly to protect segments_/id_hash_ insertion. int append_segment(const std::string &id, size_t size) { - // Flush any in-memory metadata changes (data_size, padding_size, CRC) - // accumulated by prior write()/resize() calls. + // Persist any pending data_size/padding/CRC mutations from prior + // write()/resize() before we re-hash and rewrite the segment_meta. this->flush_index(); AllShardsExclusiveLatch latch(mapping_shards_); @@ -1059,29 +952,21 @@ class BufferStorage : public IndexStorage { return IndexError_Runtime; } - // Page-aligned padded size for the new segment. Matches IndexMapping's - // CalcPageAlignedSize() so the on-disk layout stays identical. + // Page-aligned padded size; matches IndexMapping::CalcPageAlignedSize(). const size_t page_size = ailego::kVectorPageSize; const size_t padded_size = (size + page_size - 1) / page_size * page_size; - // The "current last chain" is meta_chains_.back() / chain_headers_.back(); - // footer_ is always the last chain's footer (overwritten by ParseFooter - // during ParseToMapping). + // The current last chain owns footer_ (overwritten by ParseFooter). size_t id_size = id.length() + 1; size_t need_size = sizeof(IndexFormat::SegmentMeta) + id_size; MetaChain *chain = &meta_chains_.back(); IndexFormat::MetaHeader *header = chain_headers_.back().get(); char *meta_buf = buffer_pool_buffers_.back().get(); - // Rollback handle for the (possibly committed) chain split below. - // Default is a no-op; populated ONLY after Step 1's in-memory commit - // succeeds so that a Step 2 disk-write failure can undo the split as - // well, leaving meta_chains_ / chain_headers_ / buffer_pool_buffers_ / - // footer_ / current_header_start_offset_ exactly as they were before - // append_segment() ran. Without this, a Step 2 failure would leave - // an orphan empty chain permanently appended to the file (harmless - // for correctness because it stays linked and gets reused on next - // append, but disruptive for idempotent retries and unit tests). + // Rollback handle for an in-memory-committed chain split. Default + // no-op; populated only after Step 1 commits, so a Step 2 failure + // can fully undo the split (otherwise an orphan empty chain would + // remain linked in the file). std::function rollback_step1 = []() {}; // ---- Step 1: chain split if current chain has no meta capacity left. @@ -1098,14 +983,12 @@ class BufferStorage : public IndexStorage { new_meta_total - sizeof(IndexFormat::MetaHeader) - sizeof(IndexFormat::MetaFooter)); - // Prepare the linked old footer WITHOUT mutating footer_ yet so - // that a write failure leaves in-memory state untouched. + // Stage the linked old footer without mutating footer_ yet. const auto saved_footer_before_split = footer_; IndexFormat::MetaFooter linked_footer = footer_; linked_footer.next_meta_header_offset = new_chain_start; IndexFormat::UpdateMetaFooter(&linked_footer, 0); - // Write old footer with forward link to disk. if (buffer_pool_handle_->write_meta( chain->footer_file_offset, sizeof(linked_footer), reinterpret_cast(&linked_footer)) != 0) { @@ -1114,12 +997,10 @@ class BufferStorage : public IndexStorage { return IndexError_WriteData; } - // Best-effort rollback: restore original old footer on disk if a - // subsequent disk write in this split block fails. If THIS rollback - // also fails to land on disk, the file is now in an inconsistent - // state (old footer points forward to a partially-written new chain - // region) -- raise the corrupted_ flag so subsequent writes refuse - // to compound the damage. + // Best-effort restore of the old footer if any subsequent write in + // this split block fails. If the restore itself fails, mark the + // storage corrupted -- on-disk old footer now points at a partial + // new chain region. auto undo_old_footer = [this, chain, &saved_footer_before_split]() { if (buffer_pool_handle_->write_meta( chain->footer_file_offset, sizeof(saved_footer_before_split), @@ -1135,8 +1016,7 @@ class BufferStorage : public IndexStorage { }; // Extend the file and write the new chain's header + (zero) footer. - // The segment_meta region is implicitly zero-filled by ftruncate, - // matching the empty `new_meta_buf` we keep in memory. + // The segment_meta region is zero-filled by ftruncate. if (!buffer_pool_->extend_file(new_chain_start + new_meta_total)) { undo_old_footer(); return IndexError_Runtime; @@ -1177,29 +1057,17 @@ class BufferStorage : public IndexStorage { return IndexError_WriteData; } - // Snapshot the OLD chain's pre-commit state for rollback_step1. - // Captured by value because `chain` will be reassigned below to point - // at the new chain's slot in meta_chains_, and pop_back() during - // rollback would invalidate any reference into the old slot. + // Snapshot the OLD chain's pre-commit state for rollback_step1 + // (captured by value: `chain` is reassigned below). const auto saved_old_chain_footer = chain->footer; const uint64_t saved_old_footer_file_offset = chain->footer_file_offset; const uint64_t saved_current_header_start = current_header_start_offset_; - // All split disk writes succeeded -- commit in-memory state. - // - // STRONG EXCEPTION GUARANTEE: reserve() growth FIRST so the three - // push_back's below cannot throw (capacity is sufficient and the - // moved-in elements -- unique_ptr, unique_ptr, - // and the POD MetaChain aggregate -- have noexcept move ctors). - // Without this, a bad_alloc in the middle of the three push_back's - // leaves chain_headers_/buffer_pool_buffers_/meta_chains_ at - // mismatched sizes (one or two extended, the rest not), with - // footer_/current_header_start_offset_ either still or already - // pointing at the new chain. flush_index_locked() then iterates - // `min(meta_chains_.size(), buffer_pool_buffers_.size())` and - // silently skips the orphan chain, while ParseToMapping() on next - // open follows the on-disk forward link and DOES see it -- a - // classic split-brain. + // Strong exception guarantee: reserve() FIRST so the three + // push_back's cannot throw mid-way and leave + // chain_headers_/buffer_pool_buffers_/meta_chains_ at mismatched + // sizes (which flush_index_locked() would silently skip while + // ParseToMapping() on next open follows the on-disk forward link). try { chain_headers_.reserve(chain_headers_.size() + 1); buffer_pool_buffers_.reserve(buffer_pool_buffers_.size() + 1); @@ -1224,19 +1092,15 @@ class BufferStorage : public IndexStorage { header = chain_headers_.back().get(); meta_buf = buffer_pool_buffers_.back().get(); - // Install rollback for the committed split: pop the new chain and - // restore the old chain on both disk and memory. Captured fully by - // value (except `this`-via-member-access) so a subsequent reassignment - // of local pointers (chain/header/meta_buf) does not corrupt the + // Install rollback for the committed split. Captures by value so + // later reassignment of chain/header/meta_buf does not corrupt the // closure. rollback_step1 = [this, saved_footer_before_split, saved_old_chain_footer, saved_old_footer_file_offset, saved_current_header_start]() { - // 1. Restore old chain's footer on disk (drop forward link). - // A failure here leaves the on-disk old footer still pointing - // at the now-popped new chain region, which ParseToMapping() - // would follow to garbage on the next open. Mark the storage - // corrupted so subsequent writes refuse to proceed. + // 1. Drop the forward link on the old footer. If this fails the + // on-disk old footer still points at the popped new chain + // region -- mark corrupted. if (buffer_pool_handle_->write_meta( saved_old_footer_file_offset, sizeof(saved_footer_before_split), reinterpret_cast(&saved_footer_before_split)) != @@ -1248,26 +1112,18 @@ class BufferStorage : public IndexStorage { file_name_.c_str()); corrupted_.store(true, std::memory_order_release); } - // 2. Pop the freshly-pushed new chain from in-memory containers. - // The associated unique_ptr / unique_ptr - // are released here. + // 2. Pop the freshly-pushed new chain (releases its unique_ptrs). if (!meta_chains_.empty()) meta_chains_.pop_back(); if (!chain_headers_.empty()) chain_headers_.pop_back(); if (!buffer_pool_buffers_.empty()) buffer_pool_buffers_.pop_back(); - // 3. Restore old chain's in-memory footer (its forward link was - // set to the now-popped new chain). + // 3. Restore the old chain's in-memory footer (forward link cleared). if (!meta_chains_.empty()) { meta_chains_.back().footer = saved_old_chain_footer; } - // 4. Restore footer_ and current_header_start_offset_ to their - // pre-split values. The on-disk file size is intentionally NOT - // shrunk: most buffer-pool backends offer no precise truncate, - // and the leftover bytes (the orphan new_header / new_footer - // region) are unreachable -- step 1 above has already removed - // the forward link from the old footer, so ParseToMapping() - // stops at the old chain and the leftover region is reusable - // by the next append_segment()'s split via file_size() - // realignment. + // 4. Restore footer_ + current_header_start_offset_. The on-disk + // file size is intentionally NOT shrunk: the orphan region is + // unreachable (step 1 cleared the link) and reusable by the + // next split via file_size() realignment. footer_ = saved_footer_before_split; current_header_start_offset_ = saved_current_header_start; }; @@ -1285,13 +1141,13 @@ class BufferStorage : public IndexStorage { } } - // Save mutable state for rollback if a disk write fails below. + // Save mutable state for rollback if a Step 2 disk write fails. The + // meta_buf regions that get overwritten (SegmentMeta entry + ID + // string) are also snapshotted so they can be restored exactly, + // keeping CRC consistent for a later flush_index(). const auto saved_footer = footer_; const auto saved_chain_footer = chain->footer; const auto saved_segment_ids_offset = chain->segment_ids_offset; - // Save the meta_buf regions that will be overwritten (SegmentMeta - // entry and segment-ID string) so they can be restored exactly, - // keeping the CRC consistent for a potential later flush_index(). const size_t meta_entry_off = sizeof(IndexFormat::SegmentMeta) * footer_.segment_count; const uint32_t new_ids_off = @@ -1321,9 +1177,15 @@ class BufferStorage : public IndexStorage { IndexFormat::UpdateMetaFooter(&footer_, 0); chain->footer = footer_; // sync in-memory copy for flush_index - // Rollback helper: restore meta_buf, footer_, and chain fields to - // their pre-Step-2 values so that flush_index() writes consistent - // metadata and the next append_segment() can retry cleanly. + // Rollback for Step 2: restore in-memory state AND best-effort + // rewrite the OLD segments_meta + footer back to disk. Without the + // disk rewrite, a write_meta(footer) failure (or post-write OOM) + // would tell the caller the append failed yet leave on-disk bytes + // describing the failed append -- ParseToMapping() on next open + // would surface a ghost segment with no entry in segments_/id_hash_. + // + // If the rewrite itself fails the file is unrepairable from here: + // raise corrupted_ so subsequent writers refuse to proceed. auto rollback_step2 = [&]() { std::memcpy(meta_buf + meta_entry_off, saved_meta_entry, sizeof(IndexFormat::SegmentMeta)); @@ -1331,6 +1193,21 @@ class BufferStorage : public IndexStorage { footer_ = saved_footer; chain->footer = saved_chain_footer; chain->segment_ids_offset = saved_segment_ids_offset; + + const int rc_meta = buffer_pool_handle_->write_meta( + chain->segment_meta_file_offset, chain->segment_meta_size, meta_buf); + const int rc_footer = buffer_pool_handle_->write_meta( + chain->footer_file_offset, sizeof(footer_), + reinterpret_cast(&footer_)); + if (rc_meta != 0 || rc_footer != 0) { + LOG_ERROR( + "append_segment: rollback_step2 disk rewrite FAILED " + "(rc_meta=%d, rc_footer=%d), file[%s] is now in an " + "inconsistent state -- marking storage as corrupted; further " + "writes will be rejected.", + rc_meta, rc_footer, file_name_.c_str()); + corrupted_.store(true, std::memory_order_release); + } }; if (buffer_pool_handle_->write_meta(chain->segment_meta_file_offset, @@ -1352,21 +1229,12 @@ class BufferStorage : public IndexStorage { return IndexError_WriteData; } - // All disk writes succeeded -- commit remaining in-memory state. - // - // STRONG EXCEPTION GUARANTEE: emplace into segments_ and id_hash_ as - // a single transactional unit. unordered_map::emplace() can throw - // bad_alloc (node allocation), so if id_hash_ throws after segments_ - // succeeded, undo the segments_ insertion before propagating the - // failure. Otherwise segments_ would carry an entry with no - // matching id_hash_ slot -- get(id) would return the segment via - // segments_, but any IVF/HNSW path that joins through id_hash_ - // would silently miss it, producing the lopsided mapping the prior - // bug history attributes to id_hash_ races. - // - // WrappedSegment instances already held by callers reference - // &segments_[name], whose address is stable across unordered_map - // insertions, so existing references stay valid. + // Strong exception guarantee for the in-memory commit: emplace into + // segments_ and id_hash_ as one transactional unit -- if id_hash_ + // throws after segments_ succeeded, undo segments_ before + // propagating. unordered_map::emplace() leaves existing element + // addresses stable, so WrappedSegment instances pointing into + // segments_ remain valid. auto seg_ins = segments_.end(); bool seg_inserted = false; try { @@ -1374,9 +1242,8 @@ class BufferStorage : public IndexStorage { id, IndexMapping::SegmentInfo{IndexMapping::Segment{new_seg}, chain->header_start_offset, header}); if (!ins.second) { - // Re-insertion under exclusive latch should be impossible (we - // checked find() earlier in the same critical section), but be - // defensive: fail loudly and roll the whole append back. + // Cannot happen under the exclusive latch we hold (find() above + // checked), but be defensive. LOG_ERROR( "append_segment: duplicate id appeared after commit, file[%s], " "id[%s]", @@ -1402,11 +1269,8 @@ class BufferStorage : public IndexStorage { return IndexError_Runtime; } max_segment_size_ = std::max(max_segment_size_, padded_size); - - // ---- Step 3: With the segmented page table (C1), extend_file() - // already extended the page table in-place. No pool - // rotation or flush_all is needed — the same pool/handle - // continues to serve both old and new pages. + // C1: extend_file() already extended the page table in-place; no pool + // rotation or flush_all needed. return 0; } @@ -1420,37 +1284,25 @@ class BufferStorage : public IndexStorage { private: std::atomic index_dirty_{false}; std::atomic pending_check_point_{0}; - // Set to true when a rollback path inside append_segment() fails to - // restore the on-disk metadata to its pre-call state. Once set, the - // storage is considered corrupted and all subsequent writes - // (write/append_segment/flush_index_locked) refuse to proceed so that - // we do not compound the damage on top of inconsistent on-disk state. - // The flag is only ever raised, never cleared, for the lifetime of the - // BufferStorage instance; close_index() resets the whole object. + // Set when an append_segment() rollback fails to restore on-disk state. + // Once set, all writers (write/append_segment/flush_index_locked) refuse + // to proceed. Only ever raised; cleared only by close_index(). std::atomic corrupted_{false}; - // Sharded reader-writer lock to eliminate cache-line ping-pong on the - // reader counter. Each concurrent reader hashes to its own shard, - // avoiding cross-core contention. Writers (append_segment/close_index) - // lock ALL shards to achieve exclusive access. + // Sharded reader-writer lock: each reader hashes to its own shard to + // avoid cache-line ping-pong on the reader counter; writers lock all + // shards. static constexpr size_t kMappingMutexShards = 32; struct alignas(64) MutexShard { std::shared_mutex mtx; }; mutable MutexShard mapping_shards_[kMappingMutexShards]{}; - // Per-(thread, instance) shard selection. We combine std::thread::id - // with `this` so that: - // 1) Two BufferStorage instances accessed from the SAME thread map - // to (typically) DIFFERENT shards. The previous thread_local-only - // implementation cached a single id per thread regardless of - // instance, which collapsed all instances onto one shard for that - // thread and effectively defeated sharding. - // 2) Skewed thread::id distributions (on glibc, thread::id is the - // aligned pthread_t pointer; `% 32` clusters) are dispersed by the - // boost-style hash_combine mix. - // Cost: ~3 ALU ops + one mod; cheaper than the cache-line ping-pong - // that the bug caused. + // Per-(thread, instance) shard selection. Combining thread::id with + // `this` ensures two BufferStorage instances on the same thread map to + // different shards (a thread_local-only id collapses them onto one + // shard). boost-style hash_combine disperses skewed thread::id + // distributions across the 32 shards. size_t mapping_shard_id() const { size_t seed = std::hash()(std::this_thread::get_id()); size_t inst = std::hash()(static_cast(this)); @@ -1479,9 +1331,8 @@ class BufferStorage : public IndexStorage { // buffer manager std::string file_name_; // Per-chain owning copies of MetaHeader. segments_[name].segment_header - // points into one of these, so each chain's content_offset stays stable - // across re-parses (a single shared header_ would be overwritten by the - // next chain's ParseHeader and corrupt earlier-chain segment reads). + // points into one of these; using a single shared header_ would let the + // next chain's ParseHeader overwrite earlier-chain content_offset. std::vector> chain_headers_{}; IndexFormat::MetaFooter footer_{}; std::unordered_map segments_{}; @@ -1497,22 +1348,18 @@ class BufferStorage : public IndexStorage { // init_index(). uint32_t segment_meta_capacity_{4096u}; - // Per-header-chain file offsets used by flush_index() to write updated - // segment metadata and footer back to the backing file after writes. + // Per-header-chain file offsets used by flush_index() and append_segment(). struct MetaChain { uint64_t header_start_offset; uint64_t footer_file_offset; uint64_t segment_meta_file_offset; uint32_t segment_meta_size; - // Lowest offset of segment ID strings within the segment_meta region. - // Equals segment_meta_size when no IDs have been written yet, and - // decreases by `strlen(id)+1` for each appended segment. Used by - // append_segment() to detect when the chain runs out of meta capacity - // and a new chain must be split off. + // Lowest segment-ID-string offset within segment_meta; equals + // segment_meta_size when empty, decreases by strlen(id)+1 per append. + // Used to detect when a chain split is needed. uint32_t segment_ids_offset; - // In-memory copy of this chain's MetaFooter. Kept in sync with disk - // by flush_index() and append_segment(), avoiding a pread per chain - // on every flush. + // In-memory copy of this chain's MetaFooter, kept in sync with disk by + // flush_index() and append_segment() to avoid a pread per chain. IndexFormat::MetaFooter footer; }; std::vector meta_chains_{}; From 59f80c18d17ed13803f442f9c08f9cef226c04a5 Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Thu, 28 May 2026 11:46:12 +0800 Subject: [PATCH 33/47] clang format --- src/core/utility/buffer_storage.cc | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index 5735ecedf..80b0ac394 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -506,9 +506,8 @@ class BufferStorage : public IndexStorage { footer_.segments_meta_size - iter->segment_id_offset; const size_t seg_name_len = ::strnlen(seg_name_start, seg_name_max); if (seg_name_len == seg_name_max) { - LOG_ERROR( - "ParseSegment: segment_id missing NUL terminator, file[%s]", - file_name_.c_str()); + LOG_ERROR("ParseSegment: segment_id missing NUL terminator, file[%s]", + file_name_.c_str()); return IndexError_InvalidValue; } const std::string seg_name(seg_name_start, seg_name_len); @@ -618,8 +617,8 @@ class BufferStorage : public IndexStorage { LOG_ERROR( "ParseToMapping: invalid next_meta_header_offset=%lu " "(current=%lu, file_size=%lu), file[%s]", - next_off, current_header_start_offset_, - buffer_pool_->file_size(), file_name_.c_str()); + next_off, current_header_start_offset_, buffer_pool_->file_size(), + file_name_.c_str()); return IndexError_InvalidValue; } // Bound chain count: 1024 chains @ default 1MB segment_meta_capacity From 1ddc9608156596ac6f8536f3c5b90b31759c6bfe Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Thu, 28 May 2026 14:59:25 +0800 Subject: [PATCH 34/47] fix --- src/core/utility/buffer_storage.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index 80b0ac394..c928d6d2e 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -1078,6 +1078,7 @@ class BufferStorage : public IndexStorage { undo_old_footer(); return IndexError_Runtime; } + chain = &meta_chains_.back(); chain->footer = linked_footer; // old chain keeps linked footer chain_headers_.push_back(std::move(new_header)); buffer_pool_buffers_.push_back(std::move(new_meta_buf)); From bf11afec14183a0d6eab156ca335570a79c07164 Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Thu, 28 May 2026 15:39:55 +0800 Subject: [PATCH 35/47] add buffer storage write ut --- .../core/utility/buffer_storage_write_test.cc | 1173 +++++++++++++++++ 1 file changed, 1173 insertions(+) create mode 100644 tests/core/utility/buffer_storage_write_test.cc diff --git a/tests/core/utility/buffer_storage_write_test.cc b/tests/core/utility/buffer_storage_write_test.cc new file mode 100644 index 000000000..b69a973e5 --- /dev/null +++ b/tests/core/utility/buffer_storage_write_test.cc @@ -0,0 +1,1173 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace zvec; +using namespace zvec::core; + +class BufferStorageWriteTest : public ::testing::Test { + protected: + static void SetUpTestSuite() { + // Initialize the memory limit pool with 64MB - enough for all tests. + ailego::MemoryLimitPool::get_instance().init(64 * 1024UL * 1024UL); + } + + void SetUp() override { + file_path_ = "buffer_storage_write_test_dir/test_" + + std::to_string(reinterpret_cast(this)); + ailego::File::Delete(file_path_); + ailego::File::MakePath("buffer_storage_write_test_dir"); + } + + void TearDown() override { ailego::File::Delete(file_path_); } + + // Open BufferStorage in writable mode (create_if_missing=true) + IndexStorage::Pointer OpenWritable() { + auto storage = IndexFactory::CreateStorage("BufferStorage"); + if (!storage) return nullptr; + ailego::Params params; + storage->init(params); + if (storage->open(file_path_, true) != 0) return nullptr; + return storage; + } + + // Open BufferStorage in read-only mode + IndexStorage::Pointer OpenReadOnly() { + auto storage = IndexFactory::CreateStorage("BufferStorage"); + if (!storage) return nullptr; + ailego::Params params; + storage->init(params); + if (storage->open(file_path_, false) != 0) return nullptr; + return storage; + } + + std::string file_path_; +}; + +// ===== Basic Write Tests ===== + +// Test: Create new index via BufferStorage, append segment, write data, read back +TEST_F(BufferStorageWriteTest, WriteBasicCreateAndWrite) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + ASSERT_EQ(0, storage->append("seg1", 4096)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + + std::string data = "Hello BufferStorage Write!"; + EXPECT_EQ(data.size(), seg->write(0, data.data(), data.size())); + + // Verify data via fetch + std::vector buf(data.size()); + EXPECT_EQ(data.size(), seg->fetch(0, buf.data(), buf.size())); + EXPECT_EQ(data, std::string(buf.data(), buf.size())); + + // data_size should reflect the written bytes + EXPECT_EQ(data.size(), seg->data_size()); + EXPECT_EQ(0, storage->close()); +} + +// Test: Write at non-zero offset within the segment +TEST_F(BufferStorageWriteTest, WriteAtNonZeroOffset) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + ASSERT_EQ(0, storage->append("seg1", 8192)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + + // First write at offset 0 + std::string first = "AAAA"; + EXPECT_EQ(first.size(), seg->write(0, first.data(), first.size())); + + // Second write at offset 100 + std::string second = "BBBB"; + EXPECT_EQ(second.size(), seg->write(100, second.data(), second.size())); + + // data_size should be max(first.end, second.end) = 104 + EXPECT_EQ(104u, seg->data_size()); + + // Verify both writes + std::vector buf1(first.size()); + EXPECT_EQ(first.size(), seg->fetch(0, buf1.data(), buf1.size())); + EXPECT_EQ(first, std::string(buf1.data(), buf1.size())); + + std::vector buf2(second.size()); + EXPECT_EQ(second.size(), seg->fetch(100, buf2.data(), buf2.size())); + EXPECT_EQ(second, std::string(buf2.data(), buf2.size())); + + EXPECT_EQ(0, storage->close()); +} + +// Test: Write to multiple independent segments +TEST_F(BufferStorageWriteTest, WriteMultipleSegments) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + ASSERT_EQ(0, storage->append("seg_a", 4096)); + ASSERT_EQ(0, storage->append("seg_b", 4096)); + ASSERT_EQ(0, storage->append("seg_c", 4096)); + + auto seg_a = storage->get("seg_a"); + auto seg_b = storage->get("seg_b"); + auto seg_c = storage->get("seg_c"); + ASSERT_TRUE(seg_a); + ASSERT_TRUE(seg_b); + ASSERT_TRUE(seg_c); + + std::string da = "data_for_a"; + std::string db = "data_for_b_longer"; + std::string dc = "c"; + + EXPECT_EQ(da.size(), seg_a->write(0, da.data(), da.size())); + EXPECT_EQ(db.size(), seg_b->write(0, db.data(), db.size())); + EXPECT_EQ(dc.size(), seg_c->write(0, dc.data(), dc.size())); + + // Verify independently + std::vector buf(db.size()); + EXPECT_EQ(da.size(), seg_a->fetch(0, buf.data(), da.size())); + EXPECT_EQ(da, std::string(buf.data(), da.size())); + + EXPECT_EQ(db.size(), seg_b->fetch(0, buf.data(), db.size())); + EXPECT_EQ(db, std::string(buf.data(), db.size())); + + EXPECT_EQ(dc.size(), seg_c->fetch(0, buf.data(), dc.size())); + EXPECT_EQ(dc, std::string(buf.data(), dc.size())); + + EXPECT_EQ(0, storage->close()); +} + +// Test: Overwrite existing data at the same offset +TEST_F(BufferStorageWriteTest, WriteOverwrite) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + ASSERT_EQ(0, storage->append("seg1", 4096)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + + std::string first = "XXXXXXXX"; + EXPECT_EQ(first.size(), seg->write(0, first.data(), first.size())); + + std::string second = "YYYYYYYY"; + EXPECT_EQ(second.size(), seg->write(0, second.data(), second.size())); + + // Second write should overwrite + std::vector buf(second.size()); + EXPECT_EQ(second.size(), seg->fetch(0, buf.data(), buf.size())); + EXPECT_EQ(second, std::string(buf.data(), buf.size())); + + EXPECT_EQ(0, storage->close()); +} + +// ===== Boundary / Error Tests ===== + +// Test: Write exceeding segment capacity returns 0 +TEST_F(BufferStorageWriteTest, WriteExceedsCapacity) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + // Append a small segment (page-aligned, so at least 4096 bytes capacity) + ASSERT_EQ(0, storage->append("seg1", 4096)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + + size_t cap = seg->capacity(); + ASSERT_GT(cap, 0u); + + // Write at an offset that causes overflow: offset + len > capacity + std::vector big_data(cap + 1, 'Z'); + EXPECT_EQ(0u, seg->write(0, big_data.data(), big_data.size())); + + // Write at offset that exceeds capacity + std::string small = "small"; + EXPECT_EQ(0u, seg->write(cap + 1, small.data(), small.size())); + + EXPECT_EQ(0, storage->close()); +} + +// Test: Write with zero length (edge case) +TEST_F(BufferStorageWriteTest, WriteZeroLength) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + ASSERT_EQ(0, storage->append("seg1", 4096)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + + // Writing zero bytes should succeed (no-op but valid) + EXPECT_EQ(0u, seg->write(0, "x", 0)); + EXPECT_EQ(0u, seg->data_size()); + + EXPECT_EQ(0, storage->close()); +} + +// ===== Persistence Tests ===== + +// Test: Write, flush, close, reopen, verify data persisted +TEST_F(BufferStorageWriteTest, WriteFlushReopenVerify) { + std::string data = "Persistent data that survives close/reopen"; + + { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + ASSERT_EQ(0, storage->append("persist_seg", 8192)); + auto seg = storage->get("persist_seg"); + ASSERT_TRUE(seg); + EXPECT_EQ(data.size(), seg->write(0, data.data(), data.size())); + EXPECT_EQ(0, storage->flush()); + EXPECT_EQ(0, storage->close()); + } + + // Reopen in read-only mode and verify + { + auto storage = OpenReadOnly(); + ASSERT_TRUE(storage); + auto seg = storage->get("persist_seg"); + ASSERT_TRUE(seg); + EXPECT_EQ(data.size(), seg->data_size()); + + std::vector buf(data.size()); + EXPECT_EQ(data.size(), seg->fetch(0, buf.data(), buf.size())); + EXPECT_EQ(data, std::string(buf.data(), buf.size())); + EXPECT_EQ(0, storage->close()); + } +} + +// Test: Multiple write-flush cycles persist all data +TEST_F(BufferStorageWriteTest, WriteMultipleFlushCycles) { + std::string data1 = "first_write"; + std::string data2 = "second_write_longer"; + + { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + ASSERT_EQ(0, storage->append("seg1", 4096)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + + // First write + flush + EXPECT_EQ(data1.size(), seg->write(0, data1.data(), data1.size())); + EXPECT_EQ(0, storage->flush()); + + // Second write at a different offset + flush + EXPECT_EQ(data2.size(), + seg->write(200, data2.data(), data2.size())); + EXPECT_EQ(0, storage->flush()); + EXPECT_EQ(0, storage->close()); + } + + // Verify persistence + { + auto storage = OpenReadOnly(); + ASSERT_TRUE(storage); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + + std::vector buf1(data1.size()); + EXPECT_EQ(data1.size(), seg->fetch(0, buf1.data(), buf1.size())); + EXPECT_EQ(data1, std::string(buf1.data(), buf1.size())); + + std::vector buf2(data2.size()); + EXPECT_EQ(data2.size(), seg->fetch(200, buf2.data(), buf2.size())); + EXPECT_EQ(data2, std::string(buf2.data(), buf2.size())); + + EXPECT_EQ(0, storage->close()); + } +} + +// Test: Close without explicit flush still persists (close_index does flush) +TEST_F(BufferStorageWriteTest, WriteCloseWithoutExplicitFlush) { + std::string data = "should_persist_on_close"; + + { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + ASSERT_EQ(0, storage->append("seg1", 4096)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + EXPECT_EQ(data.size(), seg->write(0, data.data(), data.size())); + // No explicit flush - close should handle it + EXPECT_EQ(0, storage->close()); + } + + { + auto storage = OpenReadOnly(); + ASSERT_TRUE(storage); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + std::vector buf(data.size()); + EXPECT_EQ(data.size(), seg->fetch(0, buf.data(), buf.size())); + EXPECT_EQ(data, std::string(buf.data(), buf.size())); + EXPECT_EQ(0, storage->close()); + } +} + +// ===== Read-Only Behavior ===== + +// Test: Write to read-only storage is a silent no-op (returns len) +TEST_F(BufferStorageWriteTest, WriteReadOnlyNoOp) { + // First create an index file with a segment + { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + ASSERT_EQ(0, storage->append("seg1", 4096)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + std::string init_data = "initial"; + seg->write(0, init_data.data(), init_data.size()); + EXPECT_EQ(0, storage->flush()); + EXPECT_EQ(0, storage->close()); + } + + // Open read-only and attempt write + { + auto storage = OpenReadOnly(); + ASSERT_TRUE(storage); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + + std::string new_data = "overwrite_attempt"; + // Should return len (silent no-op) + EXPECT_EQ(new_data.size(), + seg->write(0, new_data.data(), new_data.size())); + + // Data should remain unchanged (still "initial") + std::vector buf(7); + EXPECT_EQ(7u, seg->fetch(0, buf.data(), 7)); + EXPECT_EQ("initial", std::string(buf.data(), 7)); + + EXPECT_EQ(0, storage->close()); + } +} + +// ===== Resize Tests ===== + +// Test: Resize increases data_size without writing +TEST_F(BufferStorageWriteTest, ResizeGrow) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + ASSERT_EQ(0, storage->append("seg1", 4096)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + + EXPECT_EQ(0u, seg->data_size()); + size_t new_size = seg->resize(512); + EXPECT_EQ(512u, new_size); + EXPECT_EQ(512u, seg->data_size()); + EXPECT_EQ(seg->capacity() - 512, seg->padding_size()); + + EXPECT_EQ(0, storage->close()); +} + +// Test: Resize shrinks data_size +TEST_F(BufferStorageWriteTest, ResizeShrink) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + ASSERT_EQ(0, storage->append("seg1", 4096)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + + // Write to grow data_size to 100 + std::vector buf(100, 'X'); + seg->write(0, buf.data(), buf.size()); + EXPECT_EQ(100u, seg->data_size()); + + // Resize to smaller + size_t new_size = seg->resize(50); + EXPECT_EQ(50u, new_size); + EXPECT_EQ(50u, seg->data_size()); + + EXPECT_EQ(0, storage->close()); +} + +// Test: Resize beyond capacity is clamped +TEST_F(BufferStorageWriteTest, ResizeBeyondCapacityClamped) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + ASSERT_EQ(0, storage->append("seg1", 4096)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + + size_t cap = seg->capacity(); + size_t result = seg->resize(cap + 1000); + EXPECT_EQ(cap, result); + EXPECT_EQ(cap, seg->data_size()); + EXPECT_EQ(0u, seg->padding_size()); + + EXPECT_EQ(0, storage->close()); +} + +// ===== CRC Tests ===== + +// Test: update_data_crc reflects in data_crc() getter +TEST_F(BufferStorageWriteTest, UpdateDataCrc) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + ASSERT_EQ(0, storage->append("seg1", 4096)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + + uint32_t new_crc = 0xDEADBEEF; + seg->update_data_crc(new_crc); + EXPECT_EQ(new_crc, seg->data_crc()); + + EXPECT_EQ(0, storage->close()); +} + +// Test: CRC persists after flush and reopen +TEST_F(BufferStorageWriteTest, UpdateDataCrcPersistence) { + uint32_t crc_val = 0x12345678; + { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + ASSERT_EQ(0, storage->append("seg1", 4096)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + std::string data = "crc_test_data"; + seg->write(0, data.data(), data.size()); + seg->update_data_crc(crc_val); + EXPECT_EQ(0, storage->flush()); + EXPECT_EQ(0, storage->close()); + } + + { + auto storage = OpenReadOnly(); + ASSERT_TRUE(storage); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + EXPECT_EQ(crc_val, seg->data_crc()); + EXPECT_EQ(0, storage->close()); + } +} + +// ===== Concurrency Tests ===== + +// Test: Multiple threads writing to different segments concurrently +TEST_F(BufferStorageWriteTest, ConcurrentWriteDifferentSegments) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + const int kNumSegments = 8; + for (int i = 0; i < kNumSegments; ++i) { + ASSERT_EQ(0, storage->append("seg_" + std::to_string(i), 16384)); + } + + std::vector threads; + std::atomic errors{0}; + + for (int i = 0; i < kNumSegments; ++i) { + threads.emplace_back([&, i]() { + auto seg = storage->get("seg_" + std::to_string(i)); + if (!seg) { + errors.fetch_add(1); + return; + } + // Each thread writes its own pattern to its own segment + std::vector data(1024, static_cast('A' + i)); + for (int j = 0; j < 10; ++j) { + size_t offset = j * 1024; + if (seg->write(offset, data.data(), data.size()) != data.size()) { + errors.fetch_add(1); + } + } + }); + } + + for (auto &t : threads) t.join(); + EXPECT_EQ(0, errors.load()); + + // Verify each segment's data + for (int i = 0; i < kNumSegments; ++i) { + auto seg = storage->get("seg_" + std::to_string(i)); + ASSERT_TRUE(seg); + // Last write was at offset 9*1024, so data_size >= 10*1024 + EXPECT_GE(seg->data_size(), 10u * 1024u); + + std::vector buf(1024); + seg->fetch(0, buf.data(), 1024); + EXPECT_EQ(buf[0], static_cast('A' + i)); + } + + EXPECT_EQ(0, storage->close()); +} + +// Test: Multiple threads writing to the same segment at different offsets +TEST_F(BufferStorageWriteTest, ConcurrentWriteSameSegment) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + // Need large enough segment for all threads + ASSERT_EQ(0, storage->append("shared_seg", 65536)); + auto seg = storage->get("shared_seg"); + ASSERT_TRUE(seg); + + const int kNumThreads = 8; + const size_t kChunkSize = 256; + std::atomic errors{0}; + std::vector threads; + + for (int i = 0; i < kNumThreads; ++i) { + threads.emplace_back([&, i]() { + // Each thread writes to its own non-overlapping region + size_t offset = i * kChunkSize * 10; + std::vector data(kChunkSize, static_cast('A' + i)); + for (int j = 0; j < 10; ++j) { + if (seg->write(offset + j * kChunkSize, data.data(), data.size()) != + data.size()) { + errors.fetch_add(1); + } + } + }); + } + + for (auto &t : threads) t.join(); + EXPECT_EQ(0, errors.load()); + + // Verify each thread's region + for (int i = 0; i < kNumThreads; ++i) { + size_t offset = i * kChunkSize * 10; + std::vector buf(kChunkSize); + seg->fetch(offset, buf.data(), kChunkSize); + for (size_t b = 0; b < kChunkSize; ++b) { + EXPECT_EQ(buf[b], static_cast('A' + i)) + << "Mismatch at thread " << i << " byte " << b; + } + } + + EXPECT_EQ(0, storage->close()); +} + +// Test: Concurrent writers + flush (simulates real workload) +TEST_F(BufferStorageWriteTest, ConcurrentWriteWithFlush) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + ASSERT_EQ(0, storage->append("seg1", 65536)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + + std::atomic stop{false}; + std::atomic write_errors{0}; + + // Writer threads + std::vector writers; + for (int i = 0; i < 4; ++i) { + writers.emplace_back([&, i]() { + std::vector data(128, static_cast('0' + i)); + int iter = 0; + while (!stop.load(std::memory_order_relaxed) && iter < 100) { + size_t offset = (i * 128 + (iter % 10) * 128) % 4096; + if (seg->write(offset, data.data(), data.size()) != data.size()) { + write_errors.fetch_add(1); + } + ++iter; + } + }); + } + + // Flush thread + std::thread flusher([&]() { + for (int i = 0; i < 5; ++i) { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + storage->flush(); + } + stop.store(true); + }); + + for (auto &w : writers) w.join(); + flusher.join(); + + EXPECT_EQ(0, write_errors.load()); + EXPECT_EQ(0, storage->close()); +} + +// ===== Append + Write Integration ===== + +// Test: Append multiple segments then write to each +TEST_F(BufferStorageWriteTest, AppendThenWriteSequence) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + for (int i = 0; i < 5; ++i) { + std::string seg_name = "seg_" + std::to_string(i); + ASSERT_EQ(0, storage->append(seg_name, 4096)); + auto seg = storage->get(seg_name); + ASSERT_TRUE(seg); + + std::string data = "content_of_segment_" + std::to_string(i); + EXPECT_EQ(data.size(), seg->write(0, data.data(), data.size())); + } + + // Verify all segments have correct data + for (int i = 0; i < 5; ++i) { + std::string seg_name = "seg_" + std::to_string(i); + auto seg = storage->get(seg_name); + ASSERT_TRUE(seg); + std::string expected = "content_of_segment_" + std::to_string(i); + std::vector buf(expected.size()); + EXPECT_EQ(expected.size(), seg->fetch(0, buf.data(), buf.size())); + EXPECT_EQ(expected, std::string(buf.data(), buf.size())); + } + + EXPECT_EQ(0, storage->close()); +} + +// Test: Write to a segment, append another, write to both, verify all +TEST_F(BufferStorageWriteTest, InterleavedAppendAndWrite) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + // Append and write first segment + ASSERT_EQ(0, storage->append("seg1", 4096)); + auto seg1 = storage->get("seg1"); + ASSERT_TRUE(seg1); + std::string d1 = "first_data"; + EXPECT_EQ(d1.size(), seg1->write(0, d1.data(), d1.size())); + + // Append second segment (triggers flush_index internally) + ASSERT_EQ(0, storage->append("seg2", 4096)); + auto seg2 = storage->get("seg2"); + ASSERT_TRUE(seg2); + std::string d2 = "second_data"; + EXPECT_EQ(d2.size(), seg2->write(0, d2.data(), d2.size())); + + // Re-get seg1 (pointer stability) and write more + auto seg1_again = storage->get("seg1"); + ASSERT_TRUE(seg1_again); + std::string d1_extra = "extra"; + EXPECT_EQ(d1_extra.size(), + seg1_again->write(d1.size(), d1_extra.data(), d1_extra.size())); + + // Verify all data + std::vector buf(d1.size() + d1_extra.size()); + EXPECT_EQ(buf.size(), seg1_again->fetch(0, buf.data(), buf.size())); + EXPECT_EQ(d1 + d1_extra, std::string(buf.data(), buf.size())); + + std::vector buf2(d2.size()); + EXPECT_EQ(d2.size(), seg2->fetch(0, buf2.data(), buf2.size())); + EXPECT_EQ(d2, std::string(buf2.data(), buf2.size())); + + EXPECT_EQ(0, storage->close()); +} + +// ===== Large Write Tests ===== + +// Test: Fill entire segment capacity with data +TEST_F(BufferStorageWriteTest, WriteLargeBuffer) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + // Request 16KB segment (will be page-aligned) + ASSERT_EQ(0, storage->append("big_seg", 16384)); + auto seg = storage->get("big_seg"); + ASSERT_TRUE(seg); + + size_t cap = seg->capacity(); + ASSERT_GE(cap, 16384u); + + // Fill with a pattern + std::vector data(cap); + std::iota(data.begin(), data.end(), static_cast(0)); + EXPECT_EQ(cap, seg->write(0, data.data(), data.size())); + EXPECT_EQ(cap, seg->data_size()); + EXPECT_EQ(0u, seg->padding_size()); + + // Verify a portion + std::vector verify(1024); + EXPECT_EQ(1024u, seg->fetch(0, verify.data(), 1024)); + EXPECT_EQ(0, std::memcmp(data.data(), verify.data(), 1024)); + + EXPECT_EQ(0, storage->close()); +} + +// Test: Large write persistence across close/reopen +TEST_F(BufferStorageWriteTest, WriteLargeBufferPersistence) { + const size_t kSize = 8192; + std::vector data(kSize); + for (size_t i = 0; i < kSize; ++i) { + data[i] = static_cast(i % 256); + } + + { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + ASSERT_EQ(0, storage->append("large_seg", kSize)); + auto seg = storage->get("large_seg"); + ASSERT_TRUE(seg); + EXPECT_EQ(kSize, seg->write(0, data.data(), data.size())); + EXPECT_EQ(0, storage->close()); + } + + { + auto storage = OpenReadOnly(); + ASSERT_TRUE(storage); + auto seg = storage->get("large_seg"); + ASSERT_TRUE(seg); + EXPECT_EQ(kSize, seg->data_size()); + + std::vector buf(kSize); + EXPECT_EQ(kSize, seg->fetch(0, buf.data(), kSize)); + EXPECT_EQ(0, std::memcmp(data.data(), buf.data(), kSize)); + EXPECT_EQ(0, storage->close()); + } +} + +// ===== Refresh / Checkpoint Tests ===== + +// Test: refresh() updates checkpoint and marks dirty +TEST_F(BufferStorageWriteTest, RefreshCheckpoint) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + ASSERT_EQ(0, storage->append("seg1", 4096)); + + storage->refresh(42); + EXPECT_EQ(0, storage->flush()); + + // After flush the check_point should be >= 42 + EXPECT_GE(storage->check_point(), 42u); + + // Increasing checkpoint + storage->refresh(100); + EXPECT_EQ(0, storage->flush()); + EXPECT_GE(storage->check_point(), 100u); + + EXPECT_EQ(0, storage->close()); +} + +// ===== Duplicate / Error Handling ===== + +// Test: Appending a duplicate segment ID returns error +TEST_F(BufferStorageWriteTest, AppendDuplicateSegment) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + ASSERT_EQ(0, storage->append("dup_seg", 4096)); + // Second append with same ID should fail + EXPECT_NE(0, storage->append("dup_seg", 4096)); + + EXPECT_EQ(0, storage->close()); +} + +// Test: Appending a zero-size segment returns error +TEST_F(BufferStorageWriteTest, AppendZeroSize) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + EXPECT_NE(0, storage->append("zero_seg", 0)); + + EXPECT_EQ(0, storage->close()); +} + +// ===== Code Review Issue Tests ===== +// The following tests target specific bugs/races found during code review. + +// PR#414 Issue: data_size concurrent race on same segment. +// Multiple threads calling write() with different offsets should not corrupt +// the (data_size, padding_size) pair. Their sum must equal capacity when +// observed after all writers quiesce (individual unsynchronized reads during +// concurrent writes may appear torn, which is expected). +TEST_F(BufferStorageWriteTest, CR_DataSizePaddingSizeInvariant) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + ASSERT_EQ(0, storage->append("seg1", 8192)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + const size_t cap = seg->capacity(); + + const int kNumThreads = 8; + const int kIters = 200; + std::atomic write_failures{0}; + std::vector threads; + + for (int i = 0; i < kNumThreads; ++i) { + threads.emplace_back([&, i]() { + char buf[64]; + std::memset(buf, 'A' + i, sizeof(buf)); + for (int j = 0; j < kIters; ++j) { + // Write at various offsets within capacity to exercise data_size growth + size_t offset = ((i * 64) + j * 7) % (cap - 64); + if (seg->write(offset, buf, sizeof(buf)) != sizeof(buf)) { + write_failures.fetch_add(1); + } + } + }); + } + + for (auto &t : threads) t.join(); + EXPECT_EQ(0, write_failures.load()); + // After all writers stop, the invariant MUST hold + EXPECT_EQ(cap, seg->data_size() + seg->padding_size()); + EXPECT_GT(seg->data_size(), 0u); + EXPECT_EQ(0, storage->close()); +} + +// PR#414 Issue: Concurrent write() + resize() on same segment. +// meta_mtx_ must serialize so that (data_size, padding_size) stays consistent. +// The invariant is verified after all threads stop (reads without meta_mtx_ +// during concurrent mutation may observe a torn pair, which is expected). +TEST_F(BufferStorageWriteTest, CR_ConcurrentWriteAndResize) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + ASSERT_EQ(0, storage->append("seg1", 8192)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + const size_t cap = seg->capacity(); + + std::atomic stop{false}; + std::atomic write_failures{0}; + + // Writer thread: grows data_size by writing at increasing offsets + std::thread writer([&]() { + char buf[128]; + std::memset(buf, 'W', sizeof(buf)); + for (int j = 0; j < 300 && !stop.load(std::memory_order_relaxed); ++j) { + size_t offset = j % (cap - 128); + if (seg->write(offset, buf, sizeof(buf)) != sizeof(buf)) { + write_failures.fetch_add(1); + } + } + }); + + // Resizer thread: constantly resizes + std::thread resizer([&]() { + for (int j = 0; j < 300 && !stop.load(std::memory_order_relaxed); ++j) { + size_t new_size = (j * 37) % cap; + seg->resize(new_size); + } + stop.store(true); + }); + + writer.join(); + resizer.join(); + + EXPECT_EQ(0, write_failures.load()); + // After quiescence, invariant must hold + EXPECT_EQ(cap, seg->data_size() + seg->padding_size()); + EXPECT_EQ(0, storage->close()); +} + +// Chain-split bug: Many appends exhaust segment_meta capacity, triggering +// chain split. After reopen, ALL segments must be findable. +// (Tests fix for reserve()-induced dangling pointer in append_segment.) +TEST_F(BufferStorageWriteTest, CR_ChainSplitAllSegmentsAccessible) { + const int kNumSegments = 50; // Enough to trigger chain split with default 4096 meta capacity + + { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + for (int i = 0; i < kNumSegments; ++i) { + std::string name = "chain_seg_" + std::to_string(i); + ASSERT_EQ(0, storage->append(name, 4096)) + << "Failed to append segment " << i; + auto seg = storage->get(name); + ASSERT_TRUE(seg) << "Failed to get segment " << name << " right after append"; + // Write a marker so we can verify on reopen + std::string marker = "marker_" + std::to_string(i); + EXPECT_EQ(marker.size(), seg->write(0, marker.data(), marker.size())); + } + EXPECT_EQ(0, storage->flush()); + EXPECT_EQ(0, storage->close()); + } + + // Reopen and verify ALL segments are present and readable + { + auto storage = OpenReadOnly(); + ASSERT_TRUE(storage); + for (int i = 0; i < kNumSegments; ++i) { + std::string name = "chain_seg_" + std::to_string(i); + auto seg = storage->get(name); + ASSERT_TRUE(seg) << "Segment " << name << " missing after reopen (chain-split bug?)"; + std::string expected = "marker_" + std::to_string(i); + std::vector buf(expected.size()); + EXPECT_EQ(expected.size(), seg->fetch(0, buf.data(), buf.size())); + EXPECT_EQ(expected, std::string(buf.data(), buf.size())) + << "Data mismatch for " << name; + } + EXPECT_EQ(0, storage->close()); + } +} + +// mapping_shard_id bug: Multiple BufferStorage instances opened on the +// same thread must work correctly (the old thread_local shard_id would +// map them to the same shard, causing potential conflicts). +TEST_F(BufferStorageWriteTest, CR_MultipleInstancesSameThread) { + std::string path2 = file_path_ + "_second"; + ailego::File::Delete(path2); + + auto storage1 = OpenWritable(); + ASSERT_TRUE(storage1); + + // Open a second independent BufferStorage instance + auto storage2 = IndexFactory::CreateStorage("BufferStorage"); + ASSERT_TRUE(storage2); + ailego::Params params; + storage2->init(params); + ASSERT_EQ(0, storage2->open(path2, true)); + + // Append and write to both concurrently from the SAME thread + ASSERT_EQ(0, storage1->append("seg_a", 4096)); + ASSERT_EQ(0, storage2->append("seg_b", 4096)); + + auto seg_a = storage1->get("seg_a"); + auto seg_b = storage2->get("seg_b"); + ASSERT_TRUE(seg_a); + ASSERT_TRUE(seg_b); + + std::string da = "instance_one_data"; + std::string db = "instance_two_data"; + EXPECT_EQ(da.size(), seg_a->write(0, da.data(), da.size())); + EXPECT_EQ(db.size(), seg_b->write(0, db.data(), db.size())); + + // Verify data isolation + std::vector buf1(da.size()); + EXPECT_EQ(da.size(), seg_a->fetch(0, buf1.data(), buf1.size())); + EXPECT_EQ(da, std::string(buf1.data(), buf1.size())); + + std::vector buf2(db.size()); + EXPECT_EQ(db.size(), seg_b->fetch(0, buf2.data(), buf2.size())); + EXPECT_EQ(db, std::string(buf2.data(), buf2.size())); + + EXPECT_EQ(0, storage1->close()); + EXPECT_EQ(0, storage2->close()); + ailego::File::Delete(path2); +} + +// Cross-page read/write: Write data spanning page boundaries (4KB pages), +// then read back via both fetch() and read(MemoryBlock&) to verify the +// cross-page buffer allocation path. (Tests fix for UAF in cross-page read.) +TEST_F(BufferStorageWriteTest, CR_CrossPageWriteAndRead) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + // Segment large enough to span multiple pages + ASSERT_EQ(0, storage->append("cross_page_seg", 16384)); + auto seg = storage->get("cross_page_seg"); + ASSERT_TRUE(seg); + + // Write 5000 bytes starting at offset 2000, which crosses the first + // page boundary at 4096 (relative to segment data start in the file). + const size_t kWriteOffset = 2000; + const size_t kWriteLen = 5000; + std::vector write_data(kWriteLen); + for (size_t i = 0; i < kWriteLen; ++i) { + write_data[i] = static_cast((i * 7 + 13) % 256); + } + EXPECT_EQ(kWriteLen, seg->write(kWriteOffset, write_data.data(), kWriteLen)); + + // Read back via fetch (uses read_range internally for cross-page) + std::vector fetch_buf(kWriteLen); + EXPECT_EQ(kWriteLen, seg->fetch(kWriteOffset, fetch_buf.data(), kWriteLen)); + EXPECT_EQ(write_data, fetch_buf); + + // Read back via read(MemoryBlock&) - exercises the cross-page alloc path + IndexStorage::MemoryBlock mb; + EXPECT_EQ(kWriteLen, seg->read(kWriteOffset, mb, kWriteLen)); + EXPECT_EQ(0, std::memcmp(write_data.data(), mb.data(), kWriteLen)); + + EXPECT_EQ(0, storage->close()); +} + +// Dirty flag race: write() after flush_index() must re-set the dirty flag. +// If the write lands between CAS(dirty, false) and the end of flush, +// the next flush must still persist it. Verified by close→reopen→read. +TEST_F(BufferStorageWriteTest, CR_DirtyFlagNotLostAfterFlush) { + std::string early_data = "early"; + std::string late_data = "late_write_after_flush"; + + { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + ASSERT_EQ(0, storage->append("seg1", 4096)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + + // Write and flush + EXPECT_EQ(early_data.size(), + seg->write(0, early_data.data(), early_data.size())); + EXPECT_EQ(0, storage->flush()); + + // Write again AFTER flush - dirty flag must be re-set + EXPECT_EQ(late_data.size(), + seg->write(100, late_data.data(), late_data.size())); + // Close without explicit flush (close_index will flush) + EXPECT_EQ(0, storage->close()); + } + + // Reopen and verify the late write persisted + { + auto storage = OpenReadOnly(); + ASSERT_TRUE(storage); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + + std::vector buf(late_data.size()); + EXPECT_EQ(late_data.size(), seg->fetch(100, buf.data(), buf.size())); + EXPECT_EQ(late_data, std::string(buf.data(), buf.size())); + EXPECT_EQ(0, storage->close()); + } +} + +// Stress test: Concurrent flush + write interleaving to expose dirty flag races. +// All writes that return successfully MUST be visible after final close+reopen. +TEST_F(BufferStorageWriteTest, CR_ConcurrentFlushWriteDirtyFlagStress) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + ASSERT_EQ(0, storage->append("seg1", 65536)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + + // Track the highest offset+len successfully written + std::atomic max_committed_end{0}; + std::atomic stop{false}; + + // Writer: writes sequentially increasing offsets + std::thread writer([&]() { + char pattern[64]; + std::memset(pattern, 'P', sizeof(pattern)); + for (int i = 0; i < 500 && !stop.load(std::memory_order_relaxed); ++i) { + size_t offset = i * 64; + if (offset + 64 > 65536) break; + if (seg->write(offset, pattern, 64) == 64) { + // Update max committed end + size_t end = offset + 64; + size_t cur = max_committed_end.load(std::memory_order_relaxed); + while (end > cur) { + if (max_committed_end.compare_exchange_weak( + cur, end, std::memory_order_relaxed)) { + break; + } + } + } + } + }); + + // Flusher: repeatedly flushes to trigger the CAS(dirty, false) path + std::thread flusher([&]() { + for (int i = 0; i < 50; ++i) { + storage->flush(); + std::this_thread::sleep_for(std::chrono::microseconds(100)); + } + stop.store(true); + }); + + writer.join(); + flusher.join(); + + size_t final_data_size = seg->data_size(); + EXPECT_GE(final_data_size, max_committed_end.load()); + EXPECT_EQ(0, storage->close()); +} + +// Pointer stability after append: WrappedSegment obtained BEFORE a new +// append must still work correctly AFTER the append (unordered_map address +// stability guarantee). This tests the fix for reserve()-based invalidation. +TEST_F(BufferStorageWriteTest, CR_PointerStabilityAcrossAppend) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + ASSERT_EQ(0, storage->append("seg_first", 4096)); + auto seg_first = storage->get("seg_first"); + ASSERT_TRUE(seg_first); + + // Write initial data + std::string initial = "before_append"; + EXPECT_EQ(initial.size(), seg_first->write(0, initial.data(), initial.size())); + + // Append many more segments (may trigger internal rehash/resize) + for (int i = 0; i < 20; ++i) { + ASSERT_EQ(0, storage->append("new_seg_" + std::to_string(i), 4096)); + } + + // The original segment handle must still be valid and writable + std::string after = "_after_appends"; + EXPECT_EQ(after.size(), + seg_first->write(initial.size(), after.data(), after.size())); + + // Verify full data + std::string expected = initial + after; + std::vector buf(expected.size()); + EXPECT_EQ(expected.size(), seg_first->fetch(0, buf.data(), buf.size())); + EXPECT_EQ(expected, std::string(buf.data(), buf.size())); + + EXPECT_EQ(0, storage->close()); +} + +// update_data_crc concurrent with write: CRC update must be serialized +// with data_size changes via meta_mtx_. Invariant verified post-quiescence. +TEST_F(BufferStorageWriteTest, CR_ConcurrentWriteAndCrcUpdate) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + ASSERT_EQ(0, storage->append("seg1", 8192)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + const size_t cap = seg->capacity(); + + std::atomic stop{false}; + std::atomic write_failures{0}; + + // Writer thread + std::thread writer([&]() { + char buf[128]; + std::memset(buf, 'X', sizeof(buf)); + for (int i = 0; i < 500 && !stop.load(std::memory_order_relaxed); ++i) { + size_t offset = (i * 128) % (cap - 128); + if (seg->write(offset, buf, sizeof(buf)) != sizeof(buf)) { + write_failures.fetch_add(1); + } + } + }); + + // CRC updater thread: concurrently updates CRC + std::thread crc_updater([&]() { + for (int i = 0; i < 500 && !stop.load(std::memory_order_relaxed); ++i) { + seg->update_data_crc(static_cast(i)); + } + stop.store(true); + }); + + writer.join(); + crc_updater.join(); + + EXPECT_EQ(0, write_failures.load()); + // After all threads stop, invariant must hold + EXPECT_EQ(cap, seg->data_size() + seg->padding_size()); + // CRC should have been updated (last writer wins) + // Just verify it doesn't crash and the value is readable + (void)seg->data_crc(); + EXPECT_EQ(0, storage->close()); +} From 327bf4316667df54842453c65b1252ff74c33f05 Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Thu, 28 May 2026 17:08:10 +0800 Subject: [PATCH 36/47] fix ut --- tests/core/utility/buffer_storage_write_test.cc | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/tests/core/utility/buffer_storage_write_test.cc b/tests/core/utility/buffer_storage_write_test.cc index b69a973e5..a97a32c17 100644 --- a/tests/core/utility/buffer_storage_write_test.cc +++ b/tests/core/utility/buffer_storage_write_test.cc @@ -992,10 +992,18 @@ TEST_F(BufferStorageWriteTest, CR_CrossPageWriteAndRead) { EXPECT_EQ(kWriteLen, seg->fetch(kWriteOffset, fetch_buf.data(), kWriteLen)); EXPECT_EQ(write_data, fetch_buf); - // Read back via read(MemoryBlock&) - exercises the cross-page alloc path - IndexStorage::MemoryBlock mb; - EXPECT_EQ(kWriteLen, seg->read(kWriteOffset, mb, kWriteLen)); - EXPECT_EQ(0, std::memcmp(write_data.data(), mb.data(), kWriteLen)); + // Read back via read(MemoryBlock&) - exercises the cross-page alloc path. + // Scope the MemoryBlock so it is destroyed BEFORE storage->close(): + // when the read happens to land on a single page (e.g. macOS arm64 with + // 16KB pages, where [2000, 7000) fits in one page) the returned block + // is MBT_BUFFERPOOL holding a raw pointer to buffer_pool_handle_. Once + // close_index() resets buffer_pool_handle_/buffer_pool_, that raw + // pointer dangles and ~MemoryBlock()'s release_one() segfaults. + { + IndexStorage::MemoryBlock mb; + EXPECT_EQ(kWriteLen, seg->read(kWriteOffset, mb, kWriteLen)); + EXPECT_EQ(0, std::memcmp(write_data.data(), mb.data(), kWriteLen)); + } EXPECT_EQ(0, storage->close()); } From f5f334ca41486d456144de0c302fe6c110a5e2dc Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Fri, 29 May 2026 11:21:17 +0800 Subject: [PATCH 37/47] fix for pr comment --- src/ailego/buffer/vector_page_table.cc | 42 +++++++------------ src/core/utility/buffer_storage.cc | 14 +++---- .../zvec/ailego/buffer/vector_page_table.h | 3 +- 3 files changed, 22 insertions(+), 37 deletions(-) diff --git a/src/ailego/buffer/vector_page_table.cc b/src/ailego/buffer/vector_page_table.cc index 2c7c41667..8e5c43f30 100644 --- a/src/ailego/buffer/vector_page_table.cc +++ b/src/ailego/buffer/vector_page_table.cc @@ -20,10 +20,6 @@ #include #include -#if !defined(_MSC_VER) -#include -#endif - #if defined(_MSC_VER) #ifndef NOMINMAX #define NOMINMAX @@ -54,6 +50,16 @@ static ssize_t zvec_pwrite(int fd, const void *buf, size_t count, } return static_cast(bytes_written); } +#else +#include +static inline ssize_t zvec_pread(int fd, void *buf, size_t count, + size_t offset) { + return ::pread(fd, buf, count, static_cast(offset)); +} +static inline ssize_t zvec_pwrite(int fd, const void *buf, size_t count, + size_t offset) { + return ::pwrite(fd, buf, count, static_cast(offset)); +} #endif namespace zvec { @@ -266,18 +272,14 @@ char *VectorPageTable::set_block_acquired(block_id_t block_id, char *buffer, } } -VecBufferPool::VecBufferPool(const std::string &filename, bool writable, - bool create) { +VecBufferPool::VecBufferPool(const std::string &filename, bool writable) { file_name_ = filename; - writable_ = writable || create; + writable_ = writable; #if defined(_MSC_VER) - int flags = writable_ ? (create ? (O_RDWR | O_CREAT | O_TRUNC | _O_BINARY) - : (O_RDWR | _O_BINARY)) - : (O_RDONLY | _O_BINARY); + int flags = writable_ ? (O_RDWR | _O_BINARY) : (O_RDONLY | _O_BINARY); fd_ = _open(filename.c_str(), flags, 0644); #else - int flags = - writable_ ? (create ? (O_RDWR | O_CREAT | O_TRUNC) : O_RDWR) : O_RDONLY; + int flags = writable_ ? O_RDWR : O_RDONLY; fd_ = ::open(filename.c_str(), flags, 0644); #endif if (fd_ < 0) { @@ -322,11 +324,7 @@ int VecBufferPool::init() { page_table_.set_flush_callback([fd, &name](block_id_t /*block_id*/, char *buf, size_t sz, size_t off) -> int { -#if defined(_MSC_VER) ssize_t w = zvec_pwrite(fd, buf, sz, off); -#else - ssize_t w = ::pwrite(fd, buf, sz, off); -#endif if (w != static_cast(sz)) { LOG_ERROR( "Buffer pool flush failed: file[%s], offset[%zu], " @@ -381,11 +379,7 @@ char *VecBufferPool::acquire_buffer(block_id_t page_id, int retry) { if (expected_bytes < kVectorPageSize) { std::memset(buffer + expected_bytes, 0, kVectorPageSize - expected_bytes); } -#if defined(_MSC_VER) ssize_t read_bytes = zvec_pread(fd_, buffer, expected_bytes, page_offset); -#else - ssize_t read_bytes = pread(fd_, buffer, expected_bytes, page_offset); -#endif if (read_bytes != static_cast(expected_bytes)) { LOG_ERROR( "Buffer pool failed to read file at offset: file[%s], page_id[%zu], " @@ -398,11 +392,7 @@ char *VecBufferPool::acquire_buffer(block_id_t page_id, int retry) { } int VecBufferPool::get_meta(size_t offset, size_t length, char *buffer) { -#if defined(_MSC_VER) ssize_t read_bytes = zvec_pread(fd_, buffer, length, offset); -#else - ssize_t read_bytes = pread(fd_, buffer, length, offset); -#endif if (read_bytes != static_cast(length)) { LOG_ERROR( "Buffer pool failed to read file at offset: file[%s], offset[%zu], " @@ -456,11 +446,7 @@ int VecBufferPool::write_meta(size_t offset, size_t length, file_name_.c_str()); return -1; } -#if defined(_MSC_VER) ssize_t w = zvec_pwrite(fd_, buffer, length, offset); -#else - ssize_t w = ::pwrite(fd_, buffer, length, offset); -#endif if (w != static_cast(length)) { LOG_ERROR( "Buffer pool failed to write meta: file[%s], offset[%zu], " diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index c928d6d2e..caaa3cf8a 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -394,7 +394,7 @@ class BufferStorage : public IndexStorage { // Open in writable mode when the caller expects to modify the index // (create_if_missing=true implies write intent, same as MMapFileStorage). buffer_pool_ = std::make_shared( - path, /*writable=*/create_if_missing, /*create=*/false); + path, /*writable=*/create_if_missing); buffer_pool_handle_ = std::make_shared( buffer_pool_->get_handle()); int ret = ParseToMapping(); @@ -419,18 +419,18 @@ class BufferStorage : public IndexStorage { // caller holds either single-threaded open() or AllShardsExclusiveLatch. // Do NOT add an internal lock here -- std::shared_mutex is not reentrant. int ParseHeader(size_t offset, IndexFormat::MetaHeader *out) { - std::unique_ptr buffer(new char[sizeof(*out)]); - if (buffer_pool_handle_->get_meta(offset, sizeof(*out), buffer.get()) != - 0) { + constexpr size_t kHeaderSize = sizeof(IndexFormat::MetaHeader); + std::unique_ptr buffer(new char[kHeaderSize]); + if (buffer_pool_handle_->get_meta(offset, kHeaderSize, buffer.get()) != 0) { LOG_ERROR("Get segment header failed."); return IndexError_Runtime; } - memcpy(out, buffer.get(), sizeof(*out)); - if (out->meta_header_size != sizeof(IndexFormat::MetaHeader)) { + memcpy(out, buffer.get(), kHeaderSize); + if (out->meta_header_size != kHeaderSize) { LOG_ERROR("Header meta size is invalid."); return IndexError_InvalidLength; } - if (ailego::Crc32c::Hash(out, sizeof(*out), out->header_crc) != + if (ailego::Crc32c::Hash(out, kHeaderSize, out->header_crc) != out->header_crc) { LOG_ERROR("Header meta checksum is invalid."); return IndexError_InvalidChecksum; diff --git a/src/include/zvec/ailego/buffer/vector_page_table.h b/src/include/zvec/ailego/buffer/vector_page_table.h index f2e78a061..8bcc13e99 100644 --- a/src/include/zvec/ailego/buffer/vector_page_table.h +++ b/src/include/zvec/ailego/buffer/vector_page_table.h @@ -201,8 +201,7 @@ class VecBufferPool { static constexpr size_t kMutexBucketCount = 64UL * 1024UL; - VecBufferPool(const std::string &filename, bool writable = false, - bool create = false); + VecBufferPool(const std::string &filename, bool writable = false); ~VecBufferPool() { // Flush any remaining dirty blocks before tearing down memory/fd so that // writes are not silently lost. Safe to call even in read-only mode. From f9063e5e8536e7de6b040625c7088cc422958a49 Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Fri, 29 May 2026 16:24:17 +0800 Subject: [PATCH 38/47] add ut --- src/core/interface/indexes/ivf_index.cc | 10 +- .../index/storage/lazy_record_batch_reader.h | 3 +- tests/db/collection_test.cc | 805 +++++++++--------- 3 files changed, 435 insertions(+), 383 deletions(-) diff --git a/src/core/interface/indexes/ivf_index.cc b/src/core/interface/indexes/ivf_index.cc index 0cfba037c..d38afbabd 100644 --- a/src/core/interface/indexes/ivf_index.cc +++ b/src/core/interface/indexes/ivf_index.cc @@ -84,14 +84,18 @@ int IVFIndex::Open(const std::string &file_path, break; } case StorageOptions::StorageType::kBufferPool: { - storage_ = core::IndexFactory::CreateStorage("BufferStorage"); + // NOTE: IVF index is dumped via FileDumper (plain binary file), which is + // not compatible with BufferStorage's IndexFormat layout (header/footer + // chain). Until IVF gains a BufferStorage-aware dump path, fall back to + // MMapFileReadStorage so the freshly-dumped file can be reopened. + storage_ = core::IndexFactory::CreateStorage("MMapFileReadStorage"); if (storage_ == nullptr) { - LOG_ERROR("Failed to create BufferStorage"); + LOG_ERROR("Failed to create MMapFileReadStorage (IVF buffer-pool fallback)"); return core::IndexError_Runtime; } int ret = storage_->init(storage_params); if (ret != 0) { - LOG_ERROR("Failed to init BufferStorage, path: %s, err: %s", + LOG_ERROR("Failed to init MMapFileReadStorage (IVF buffer-pool fallback), path: %s, err: %s", file_path_.c_str(), core::IndexError::What(ret)); return ret; } diff --git a/src/db/index/storage/lazy_record_batch_reader.h b/src/db/index/storage/lazy_record_batch_reader.h index 451bba8e0..e1286e305 100644 --- a/src/db/index/storage/lazy_record_batch_reader.h +++ b/src/db/index/storage/lazy_record_batch_reader.h @@ -128,7 +128,8 @@ class ParquetRecordBatchReader : public arrow::RecordBatchReader { std::vector> chunks(col_indices_.size()); if (with_cache_) { for (size_t col_idx = 0; col_idx < col_indices_.size(); ++col_idx) { - auto buffer_id = ailego::ParquetBufferID(file_path_, col_idx, rg_id); + auto buffer_id = + ailego::ParquetBufferID(file_path_, col_indices_[col_idx], rg_id); auto buffer_handle = ailego::ParquetBufferPool::get_instance().acquire_buffer(buffer_id); std::shared_ptr col_chunked_array = diff --git a/tests/db/collection_test.cc b/tests/db/collection_test.cc index 931740155..76910c3bb 100644 --- a/tests/db/collection_test.cc +++ b/tests/db/collection_test.cc @@ -47,6 +47,8 @@ std::string col_path = "test_collection"; class CollectionTest : public ::testing::Test { protected: void SetUp() override { + zvec::ailego::MemoryLimitPool::get_instance().init( + 2 * 1024ll * 1024ll * 1024ll); FileHelper::RemoveDirectory(col_path); } @@ -57,128 +59,132 @@ class CollectionTest : public ::testing::Test { }; TEST_F(CollectionTest, Feature_CreateAndOpen_General) { - CollectionOptions options; - options.read_only_ = false; - options.enable_mmap_ = true; + auto func = [&](bool enable_mmap) { + CollectionOptions options; + options.read_only_ = false; + options.enable_mmap_ = enable_mmap; - std::string path = "./demo"; + std::string path = "./demo"; - ailego::FileHelper::RemoveDirectory(path.c_str()); + ailego::FileHelper::RemoveDirectory(path.c_str()); - auto schema = TestHelper::CreateNormalSchema(); - auto result = Collection::CreateAndOpen(path, *schema, options); - if (!result.has_value()) { - std::cout << result.error().message() << std::endl; - } - ASSERT_TRUE(result.has_value()); - ASSERT_TRUE(ailego::FileHelper::IsExist(path.c_str())); - - auto col = result.value(); - ASSERT_EQ(col->Path(), path); - ASSERT_EQ(col->Schema(), *schema); - ASSERT_EQ(col->Options(), options); - auto stats = col->Stats().value(); - ASSERT_TRUE(stats.doc_count == 0); - ASSERT_EQ(stats.index_completeness["dense_fp32"], 1); - ASSERT_EQ(stats.index_completeness["dense_fp16"], 1); - // ASSERT_EQ(stats.index_completeness["dense_fp64"], 1); - ASSERT_EQ(stats.index_completeness["sparse_fp32"], 1); - ASSERT_EQ(stats.index_completeness["sparse_fp16"], 1); - - ASSERT_EQ(col->Destroy(), Status::OK()); - - // after destroyed, every interface should return error - std::vector empty_docs; - ASSERT_FALSE(col->Insert(empty_docs).has_value()); - ASSERT_FALSE(col->Update(empty_docs).has_value()); - ASSERT_FALSE(col->Delete({}).has_value()); - ASSERT_FALSE(col->DeleteByFilter("").ok()); - ASSERT_FALSE(col->Fetch({}).has_value()); - ASSERT_FALSE(col->Query(VectorQuery{}).has_value()); - ASSERT_FALSE(col->Query(MultiQuery{}).has_value()); - ASSERT_FALSE(col->GroupByQuery({}).has_value()); - ASSERT_FALSE(col->CreateIndex("", nullptr).ok()); - ASSERT_FALSE(col->DropIndex("").ok()); - ASSERT_FALSE(col->AddColumn(nullptr, "").ok()); - ASSERT_FALSE(col->AlterColumn("", "", nullptr).ok()); - ASSERT_FALSE(col->DropColumn("").ok()); - ASSERT_FALSE(col->CreateIndex("", nullptr).ok()); - ASSERT_FALSE(col->Optimize().ok()); - ASSERT_FALSE(col->Flush().ok()); - ASSERT_FALSE(col->Destroy().ok()); - ASSERT_FALSE(col->Options().has_value()); - ASSERT_FALSE(col->Path().has_value()); - ASSERT_FALSE(col->Stats().has_value()); - ASSERT_FALSE(col->Schema().has_value()); - - ASSERT_FALSE(ailego::FileHelper::IsExist(path.c_str())); - - // recreate - result = Collection::CreateAndOpen(path, *schema, options); - ASSERT_TRUE(result.has_value()); - ASSERT_TRUE(ailego::FileHelper::IsExist(path.c_str())); + auto schema = TestHelper::CreateNormalSchema(); + auto result = Collection::CreateAndOpen(path, *schema, options); + if (!result.has_value()) { + std::cout << result.error().message() << std::endl; + } + ASSERT_TRUE(result.has_value()); + ASSERT_TRUE(ailego::FileHelper::IsExist(path.c_str())); + + auto col = result.value(); + ASSERT_EQ(col->Path(), path); + ASSERT_EQ(col->Schema(), *schema); + ASSERT_EQ(col->Options(), options); + auto stats = col->Stats().value(); + ASSERT_TRUE(stats.doc_count == 0); + ASSERT_EQ(stats.index_completeness["dense_fp32"], 1); + ASSERT_EQ(stats.index_completeness["dense_fp16"], 1); + // ASSERT_EQ(stats.index_completeness["dense_fp64"], 1); + ASSERT_EQ(stats.index_completeness["sparse_fp32"], 1); + ASSERT_EQ(stats.index_completeness["sparse_fp16"], 1); - col = std::move(result.value()); - col.reset(); - col = nullptr; + ASSERT_EQ(col->Destroy(), Status::OK()); + + // after destroyed, every interface should return error + std::vector empty_docs; + ASSERT_FALSE(col->Insert(empty_docs).has_value()); + ASSERT_FALSE(col->Update(empty_docs).has_value()); + ASSERT_FALSE(col->Delete({}).has_value()); + ASSERT_FALSE(col->DeleteByFilter("").ok()); + ASSERT_FALSE(col->Fetch({}).has_value()); + ASSERT_FALSE(col->Query(VectorQuery{}).has_value()); + ASSERT_FALSE(col->Query(MultiQuery{}).has_value()); + ASSERT_FALSE(col->GroupByQuery({}).has_value()); + ASSERT_FALSE(col->CreateIndex("", nullptr).ok()); + ASSERT_FALSE(col->DropIndex("").ok()); + ASSERT_FALSE(col->AddColumn(nullptr, "").ok()); + ASSERT_FALSE(col->AlterColumn("", "", nullptr).ok()); + ASSERT_FALSE(col->DropColumn("").ok()); + ASSERT_FALSE(col->CreateIndex("", nullptr).ok()); + ASSERT_FALSE(col->Optimize().ok()); + ASSERT_FALSE(col->Flush().ok()); + ASSERT_FALSE(col->Destroy().ok()); + ASSERT_FALSE(col->Options().has_value()); + ASSERT_FALSE(col->Path().has_value()); + ASSERT_FALSE(col->Stats().has_value()); + ASSERT_FALSE(col->Schema().has_value()); + + ASSERT_FALSE(ailego::FileHelper::IsExist(path.c_str())); + + // recreate + result = Collection::CreateAndOpen(path, *schema, options); + ASSERT_TRUE(result.has_value()); + ASSERT_TRUE(ailego::FileHelper::IsExist(path.c_str())); - ASSERT_TRUE(ailego::FileHelper::IsExist(path.c_str())); + col = std::move(result.value()); + col.reset(); + col = nullptr; - // reopen - result = Collection::Open(path, options); - ASSERT_TRUE(result.has_value()); - col = std::move(result.value()); - col.reset(); + ASSERT_TRUE(ailego::FileHelper::IsExist(path.c_str())); - // reopen with read-only - options.read_only_ = true; - result = Collection::Open(path, options); - if (!result.has_value()) { - std::cout << result.error().message() << std::endl; - } - ASSERT_TRUE(result.has_value()); - col = result.value(); + // reopen + result = Collection::Open(path, options); + ASSERT_TRUE(result.has_value()); + col = std::move(result.value()); + col.reset(); - ASSERT_EQ(col->Path(), path); - ASSERT_EQ(col->Schema(), *schema); - ASSERT_EQ(col->Options(), options); - stats = col->Stats().value(); - ASSERT_TRUE(stats.doc_count == 0); - ASSERT_EQ(stats.index_completeness["dense_fp32"], 1); - ASSERT_EQ(stats.index_completeness["dense_fp16"], 1); - // ASSERT_EQ(stats.index_completeness["dense_fp64"], 1); - ASSERT_EQ(stats.index_completeness["sparse_fp32"], 1); - ASSERT_EQ(stats.index_completeness["sparse_fp16"], 1); - - // when open with read-only, write operation should fail - ASSERT_FALSE(col->Flush().ok()); - ASSERT_FALSE(col->Destroy().ok()); - ASSERT_FALSE(col->Insert(empty_docs).has_value()); - ASSERT_FALSE(col->Update(empty_docs).has_value()); - ASSERT_FALSE(col->Delete({}).has_value()); - ASSERT_FALSE(col->DeleteByFilter("").ok()); - ASSERT_FALSE(col->CreateIndex("", nullptr).ok()); - ASSERT_FALSE(col->DropIndex("").ok()); - ASSERT_FALSE(col->AddColumn(nullptr, "").ok()); - ASSERT_FALSE(col->AlterColumn("", "", nullptr).ok()); - ASSERT_FALSE(col->DropColumn("").ok()); - ASSERT_FALSE(col->CreateIndex("", nullptr).ok()); - ASSERT_FALSE(col->Optimize().ok()); - - // two threads open with read_only - result = Collection::Open(path, options); - if (!result.has_value()) { - std::cout << result.error().message() << std::endl; - } - ASSERT_TRUE(result.has_value()); - col = result.value(); + // reopen with read-only + options.read_only_ = true; + result = Collection::Open(path, options); + if (!result.has_value()) { + std::cout << result.error().message() << std::endl; + } + ASSERT_TRUE(result.has_value()); + col = result.value(); - auto result1 = Collection::Open(path, options); - if (!result1.has_value()) { - std::cout << result1.error().message() << std::endl; - } - ASSERT_TRUE(result1.has_value()); - auto col1 = result1.value(); + ASSERT_EQ(col->Path(), path); + ASSERT_EQ(col->Schema(), *schema); + ASSERT_EQ(col->Options(), options); + stats = col->Stats().value(); + ASSERT_TRUE(stats.doc_count == 0); + ASSERT_EQ(stats.index_completeness["dense_fp32"], 1); + ASSERT_EQ(stats.index_completeness["dense_fp16"], 1); + // ASSERT_EQ(stats.index_completeness["dense_fp64"], 1); + ASSERT_EQ(stats.index_completeness["sparse_fp32"], 1); + ASSERT_EQ(stats.index_completeness["sparse_fp16"], 1); + + // when open with read-only, write operation should fail + ASSERT_FALSE(col->Flush().ok()); + ASSERT_FALSE(col->Destroy().ok()); + ASSERT_FALSE(col->Insert(empty_docs).has_value()); + ASSERT_FALSE(col->Update(empty_docs).has_value()); + ASSERT_FALSE(col->Delete({}).has_value()); + ASSERT_FALSE(col->DeleteByFilter("").ok()); + ASSERT_FALSE(col->CreateIndex("", nullptr).ok()); + ASSERT_FALSE(col->DropIndex("").ok()); + ASSERT_FALSE(col->AddColumn(nullptr, "").ok()); + ASSERT_FALSE(col->AlterColumn("", "", nullptr).ok()); + ASSERT_FALSE(col->DropColumn("").ok()); + ASSERT_FALSE(col->CreateIndex("", nullptr).ok()); + ASSERT_FALSE(col->Optimize().ok()); + + // two threads open with read_only + result = Collection::Open(path, options); + if (!result.has_value()) { + std::cout << result.error().message() << std::endl; + } + ASSERT_TRUE(result.has_value()); + col = result.value(); + + auto result1 = Collection::Open(path, options); + if (!result1.has_value()) { + std::cout << result1.error().message() << std::endl; + } + ASSERT_TRUE(result1.has_value()); + auto col1 = result1.value(); + }; + // func(true); + func(false); } TEST_F(CollectionTest, Feature_CreateAndOpen_Empty) { @@ -391,13 +397,13 @@ TEST_F(CollectionTest, Feature_Write_Batch_Validate) { } TEST_F(CollectionTest, Feature_Insert_General) { - auto func = [&](bool schema_nullable, bool doc_nullable, + auto func = [&](bool enable_mmap, bool schema_nullable, bool doc_nullable, int doc_count = 1000) { FileHelper::RemoveDirectory(col_path); // create with normal schema auto schema = TestHelper::CreateNormalSchema(schema_nullable); - auto options = CollectionOptions{false, true, 100 * 1024 * 1024}; + auto options = CollectionOptions{false, enable_mmap, 100 * 1024 * 1024}; auto collection = TestHelper::CreateCollectionWithDoc( col_path, *schema, options, 0, doc_count, doc_nullable); @@ -478,14 +484,16 @@ TEST_F(CollectionTest, Feature_Insert_General) { ASSERT_EQ(stats.index_completeness["sparse_fp16"], 1); }; - func(false, false); - func(true, true); - func(true, false); - func(false, true); + for (bool enable_mmap : {/*true,*/ false}) { + func(enable_mmap, false, false); + func(enable_mmap, true, true); + func(enable_mmap, true, false); + func(enable_mmap, false, true); - func(false, false, 0); - func(false, false, 1); - func(false, false, 2); + func(enable_mmap, false, false, 0); + func(enable_mmap, false, false, 1); + func(enable_mmap, false, false, 2); + } } TEST_F(CollectionTest, Feature_Insert_ScalarIndex) { @@ -809,13 +817,13 @@ TEST_F(CollectionTest, Feature_Insert_Duplicate) { } TEST_F(CollectionTest, Feature_Upsert_General) { - auto func = [&](bool schema_nullable, bool doc_nullable, + auto func = [&](bool enable_mmap, bool schema_nullable, bool doc_nullable, int doc_count = 1000) { FileHelper::RemoveDirectory(col_path); // create with normal schema auto schema = TestHelper::CreateNormalSchema(schema_nullable); - auto options = CollectionOptions{false, true, 100 * 1024 * 1024}; + auto options = CollectionOptions{false, enable_mmap, 100 * 1024 * 1024}; auto collection = TestHelper::CreateCollectionWithDoc( col_path, *schema, options, 0, doc_count, doc_nullable, true); @@ -896,14 +904,16 @@ TEST_F(CollectionTest, Feature_Upsert_General) { ASSERT_EQ(stats.index_completeness["sparse_fp16"], 1); }; - func(false, false); - func(true, true); - func(true, false); - func(false, true); + for (bool enable_mmap : {/*true,*/ false}) { + func(enable_mmap, false, false); + func(enable_mmap, true, true); + func(enable_mmap, true, false); + func(enable_mmap, false, true); - func(false, false, 0); - func(false, false, 1); - func(false, false, 2); + func(enable_mmap, false, false, 0); + func(enable_mmap, false, false, 1); + func(enable_mmap, false, false, 2); + } } TEST_F(CollectionTest, Feature_Upsert_Incremental) { @@ -1096,9 +1106,9 @@ TEST_F(CollectionTest, Feature_Upsert_Nullable) { TEST_F(CollectionTest, Feature_Update_General) { - auto func = [&](int doc_count) { + auto func = [&](bool enable_mmap, int doc_count) { auto schema = TestHelper::CreateNormalSchema(); - auto options = CollectionOptions{false, true, 100 * 1024 * 1024}; + auto options = CollectionOptions{false, enable_mmap, 100 * 1024 * 1024}; FileHelper::RemoveDirectory(col_path); // insert first @@ -1180,10 +1190,12 @@ TEST_F(CollectionTest, Feature_Update_General) { check_doc(doc_count); }; - func(99); - func(100); - func(101); - func(1000); + for (bool enable_mmap : {/*true,*/ false}) { + func(enable_mmap, 99); + func(enable_mmap, 100); + func(enable_mmap, 101); + func(enable_mmap, 1000); + } } TEST_F(CollectionTest, Feature_Update_Incremental) { @@ -1437,9 +1449,9 @@ TEST_F(CollectionTest, Feature_Update_Empty) { } TEST_F(CollectionTest, Feature_Delete_General) { - auto func = [&](int doc_count) { + auto func = [&](bool enable_mmap, int doc_count) { auto schema = TestHelper::CreateNormalSchema(); - auto options = CollectionOptions{false, true, 100 * 1024 * 1024}; + auto options = CollectionOptions{false, enable_mmap, 100 * 1024 * 1024}; FileHelper::RemoveDirectory(col_path); // insert first @@ -1515,10 +1527,12 @@ TEST_F(CollectionTest, Feature_Delete_General) { check_doc(doc_count); }; - func(99); - func(100); - func(101); - func(1000); + for (bool enable_mmap : {/*true,*/ false}) { + func(enable_mmap, 99); + func(enable_mmap, 100); + func(enable_mmap, 101); + func(enable_mmap, 1000); + } } TEST_F(CollectionTest, Feature_Delete_Repeated) { @@ -1578,9 +1592,9 @@ TEST_F(CollectionTest, Feature_Delete_Repeated) { } TEST_F(CollectionTest, Feature_DeleteByFilter_General) { - auto func = [&](int doc_count) { + auto func = [&](bool enable_mmap, int doc_count) { auto schema = TestHelper::CreateNormalSchema(); - auto options = CollectionOptions{false, true, 100 * 1024 * 1024}; + auto options = CollectionOptions{false, enable_mmap, 100 * 1024 * 1024}; FileHelper::RemoveDirectory(col_path); // insert first @@ -1659,10 +1673,12 @@ TEST_F(CollectionTest, Feature_DeleteByFilter_General) { check_doc(doc_count); }; - func(99); - func(100); - func(101); - func(1000); + for (bool enable_mmap : {/*true,*/ false}) { + func(enable_mmap, 99); + func(enable_mmap, 100); + func(enable_mmap, 101); + func(enable_mmap, 1000); + } } TEST_F(CollectionTest, Feature_DeleteByFilter_ScalarIndex) { @@ -1755,122 +1771,131 @@ TEST_F(CollectionTest, Feature_DeleteByFilter_ScalarIndex) { } TEST_F(CollectionTest, Feature_MixedWrite_General) { - // case1: insert -> upsert -> update -> delete - auto schema = TestHelper::CreateNormalSchema(); - auto options = CollectionOptions{false, true, 100 * 1024 * 1024}; - FileHelper::RemoveDirectory(col_path); + auto func = [&](bool enable_mmap) { + // case1: insert -> upsert -> update -> delete + auto schema = TestHelper::CreateNormalSchema(); + auto options = CollectionOptions{false, enable_mmap, 100 * 1024 * 1024}; + FileHelper::RemoveDirectory(col_path); - // insert first - auto collection = - TestHelper::CreateCollectionWithDoc(col_path, *schema, options, 0, 0); + // insert first + auto collection = + TestHelper::CreateCollectionWithDoc(col_path, *schema, options, 0, 0); - for (int i = 0; i < 100; i++) { - // std::cout << "insert: " << i << std::endl; - - // insert - auto new_doc = TestHelper::CreateDoc(i, *schema); - std::vector new_docs = {new_doc}; - auto res = collection->Insert(new_docs); - ASSERT_TRUE(res.has_value()); - ASSERT_TRUE(res.value()[0].ok()); - - // fetch - auto docs = collection->Fetch({TestHelper::MakePK(i)}); - ASSERT_TRUE(docs.has_value()); - ASSERT_EQ(docs.value().size(), 1); - ASSERT_EQ(docs.value().count(TestHelper::MakePK(i)), 1); - ASSERT_EQ(new_doc, *docs.value()[TestHelper::MakePK(i)]); + for (int i = 0; i < 100; i++) { + // std::cout << "insert: " << i << std::endl; - auto stats = collection->Stats().value(); - ASSERT_EQ(stats.doc_count, i + 1); - - // upsert - new_doc = TestHelper::CreateDoc(i + 1, *schema, TestHelper::MakePK(i)); - new_docs = {new_doc}; - res = collection->Upsert(new_docs); - ASSERT_TRUE(res.has_value()); - ASSERT_TRUE(res.value()[0].ok()); - - // fetch - docs = collection->Fetch({TestHelper::MakePK(i)}).value(); - ASSERT_TRUE(docs.has_value()); - ASSERT_EQ(docs.value().size(), 1); - ASSERT_EQ(docs.value().count(TestHelper::MakePK(i)), 1); - ASSERT_EQ(new_doc, *docs.value()[TestHelper::MakePK(i)]); + // insert + auto new_doc = TestHelper::CreateDoc(i, *schema); + std::vector new_docs = {new_doc}; + auto res = collection->Insert(new_docs); + ASSERT_TRUE(res.has_value()); + ASSERT_TRUE(res.value()[0].ok()); - stats = collection->Stats().value(); - ASSERT_EQ(stats.doc_count, i + 1); - - // update - new_doc = TestHelper::CreateDoc(i + 2, *schema, TestHelper::MakePK(i)); - new_docs = {new_doc}; - res = collection->Update(new_docs); - ASSERT_TRUE(res.has_value()); - ASSERT_TRUE(res.value()[0].ok()); - - // fetch - docs = collection->Fetch({TestHelper::MakePK(i)}).value(); - ASSERT_TRUE(docs.has_value()); - ASSERT_EQ(docs.value().size(), 1); - ASSERT_EQ(docs.value().count(TestHelper::MakePK(i)), 1); - ASSERT_EQ(new_doc, *docs.value()[TestHelper::MakePK(i)]); + // fetch + auto docs = collection->Fetch({TestHelper::MakePK(i)}); + ASSERT_TRUE(docs.has_value()); + ASSERT_EQ(docs.value().size(), 1); + ASSERT_EQ(docs.value().count(TestHelper::MakePK(i)), 1); + ASSERT_EQ(new_doc, *docs.value()[TestHelper::MakePK(i)]); - stats = collection->Stats().value(); - ASSERT_EQ(stats.doc_count, i + 1); + auto stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, i + 1); - // delete - res = collection->Delete({TestHelper::MakePK(i)}); - ASSERT_TRUE(res.has_value()); - ASSERT_TRUE(res.value()[0].ok()); + // upsert + new_doc = TestHelper::CreateDoc(i + 1, *schema, TestHelper::MakePK(i)); + new_docs = {new_doc}; + res = collection->Upsert(new_docs); + ASSERT_TRUE(res.has_value()); + ASSERT_TRUE(res.value()[0].ok()); - stats = collection->Stats().value(); - ASSERT_EQ(stats.doc_count, i); - - // insert again - new_doc = TestHelper::CreateDoc(i, *schema); - new_docs = {new_doc}; - res = collection->Insert(new_docs); - ASSERT_TRUE(res.has_value()); - ASSERT_TRUE(res.value()[0].ok()); - - // fetch - docs = collection->Fetch({TestHelper::MakePK(i)}); - ASSERT_TRUE(docs.has_value()); - ASSERT_EQ(docs.value().size(), 1); - ASSERT_EQ(docs.value().count(TestHelper::MakePK(i)), 1); - ASSERT_EQ(new_doc, *docs.value()[TestHelper::MakePK(i)]); + // fetch + docs = collection->Fetch({TestHelper::MakePK(i)}).value(); + ASSERT_TRUE(docs.has_value()); + ASSERT_EQ(docs.value().size(), 1); + ASSERT_EQ(docs.value().count(TestHelper::MakePK(i)), 1); + ASSERT_EQ(new_doc, *docs.value()[TestHelper::MakePK(i)]); - stats = collection->Stats().value(); - ASSERT_EQ(stats.doc_count, i + 1); - } + stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, i + 1); + + // update + new_doc = TestHelper::CreateDoc(i + 2, *schema, TestHelper::MakePK(i)); + new_docs = {new_doc}; + res = collection->Update(new_docs); + ASSERT_TRUE(res.has_value()); + ASSERT_TRUE(res.value()[0].ok()); + + // fetch + docs = collection->Fetch({TestHelper::MakePK(i)}).value(); + ASSERT_TRUE(docs.has_value()); + ASSERT_EQ(docs.value().size(), 1); + ASSERT_EQ(docs.value().count(TestHelper::MakePK(i)), 1); + ASSERT_EQ(new_doc, *docs.value()[TestHelper::MakePK(i)]); + + stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, i + 1); + + // delete + res = collection->Delete({TestHelper::MakePK(i)}); + ASSERT_TRUE(res.has_value()); + ASSERT_TRUE(res.value()[0].ok()); + + stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, i); + + // insert again + new_doc = TestHelper::CreateDoc(i, *schema); + new_docs = {new_doc}; + res = collection->Insert(new_docs); + ASSERT_TRUE(res.has_value()); + ASSERT_TRUE(res.value()[0].ok()); + + // fetch + docs = collection->Fetch({TestHelper::MakePK(i)}); + ASSERT_TRUE(docs.has_value()); + ASSERT_EQ(docs.value().size(), 1); + ASSERT_EQ(docs.value().count(TestHelper::MakePK(i)), 1); + ASSERT_EQ(new_doc, *docs.value()[TestHelper::MakePK(i)]); + + stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, i + 1); + } + }; + // func(true); + func(false); } TEST_F(CollectionTest, Feature_CreateIndex_General) { - // create empty collection - auto schema = TestHelper::CreateNormalSchema(); - auto options = CollectionOptions{false, true, 64 * 1024 * 1024}; - auto collection = TestHelper::CreateCollectionWithDoc(col_path, *schema, - options, 0, 0, false); + auto func = [&](bool enable_mmap) { + // create empty collection + auto schema = TestHelper::CreateNormalSchema(); + auto options = CollectionOptions{false, enable_mmap, 64 * 1024 * 1024}; + auto collection = TestHelper::CreateCollectionWithDoc(col_path, *schema, + options, 0, 0, false); - ASSERT_TRUE(collection->Flush().ok()); - auto stats = collection->Stats().value(); - ASSERT_EQ(stats.doc_count, 0); + ASSERT_TRUE(collection->Flush().ok()); + auto stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, 0); - auto index_params = std::make_shared(MetricType::IP); - auto s = collection->CreateIndex("dense_fp32", index_params); - if (!s.ok()) { - std::cout << "status: " << s.message() << std::endl; - ASSERT_TRUE(false); - } - auto new_index_params = std::make_shared(MetricType::COSINE); - s = collection->CreateIndex("dense_fp32", index_params); - if (!s.ok()) { - std::cout << "status: " << s.message() << std::endl; - ASSERT_TRUE(false); - } + auto index_params = std::make_shared(MetricType::IP); + auto s = collection->CreateIndex("dense_fp32", index_params); + if (!s.ok()) { + std::cout << "status: " << s.message() << std::endl; + ASSERT_TRUE(false); + } + auto new_index_params = + std::make_shared(MetricType::COSINE); + s = collection->CreateIndex("dense_fp32", index_params); + if (!s.ok()) { + std::cout << "status: " << s.message() << std::endl; + ASSERT_TRUE(false); + } - s = collection->CreateIndex("dense_fp32_invalid", index_params); - ASSERT_FALSE(s.ok()); + s = collection->CreateIndex("dense_fp32_invalid", index_params); + ASSERT_FALSE(s.ok()); + }; + // func(true); + func(false); } TEST_F(CollectionTest, Feature_CreateIndex_Vector) { @@ -2230,72 +2255,76 @@ TEST_F(CollectionTest, Feature_CreateIndex_Scalar) { } TEST_F(CollectionTest, Feature_DropIndex_General) { - // create empty collection - auto schema = TestHelper::CreateSchemaWithVectorIndex(); - auto options = CollectionOptions{false, true, 64 * 1024 * 1204}; - auto collection = TestHelper::CreateCollectionWithDoc(col_path, *schema, - options, 0, 0, false); + auto func = [&](bool enable_mmap) { + // create empty collection + auto schema = TestHelper::CreateSchemaWithVectorIndex(); + auto options = CollectionOptions{false, enable_mmap, 64 * 1024 * 1204}; + auto collection = TestHelper::CreateCollectionWithDoc(col_path, *schema, + options, 0, 0, false); - ASSERT_TRUE(collection->Flush().ok()); - auto stats = collection->Stats().value(); - ASSERT_EQ(stats.doc_count, 0); - ASSERT_EQ(stats.index_completeness["dense_fp32"], 1); + ASSERT_TRUE(collection->Flush().ok()); + auto stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, 0); + ASSERT_EQ(stats.index_completeness["dense_fp32"], 1); - ASSERT_EQ(collection->Schema(), *schema); + ASSERT_EQ(collection->Schema(), *schema); - auto s = collection->DropIndex("dense_fp32_invalid"); - ASSERT_FALSE(s.ok()); + auto s = collection->DropIndex("dense_fp32_invalid"); + ASSERT_FALSE(s.ok()); - s = collection->DropIndex("dense_fp32"); - if (!s.ok()) { - std::cout << "drop index err: " << s.message() << std::endl; - } - ASSERT_TRUE(s.ok()); + s = collection->DropIndex("dense_fp32"); + if (!s.ok()) { + std::cout << "drop index err: " << s.message() << std::endl; + } + ASSERT_TRUE(s.ok()); - s = collection->DropIndex("dense_fp32"); - ASSERT_TRUE(s.ok()); + s = collection->DropIndex("dense_fp32"); + ASSERT_TRUE(s.ok()); - auto new_schema = std::make_shared(*schema); - s = new_schema->drop_index("dense_fp32"); - ASSERT_TRUE(s.ok()); - ASSERT_EQ(*new_schema, collection->Schema()); + auto new_schema = std::make_shared(*schema); + s = new_schema->drop_index("dense_fp32"); + ASSERT_TRUE(s.ok()); + ASSERT_EQ(*new_schema, collection->Schema()); - stats = collection->Stats().value(); - ASSERT_EQ(stats.doc_count, 0); - ASSERT_EQ(stats.index_completeness["dense_fp32"], 1); + stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, 0); + ASSERT_EQ(stats.index_completeness["dense_fp32"], 1); - ASSERT_EQ(*collection->Schema() - .value() - .get_vector_field("dense_fp32") - ->index_params(), - DefaultVectorIndexParams); + ASSERT_EQ(*collection->Schema() + .value() + .get_vector_field("dense_fp32") + ->index_params(), + DefaultVectorIndexParams); - s = collection->DropIndex("dense_fp32"); - if (!s.ok()) { - std::cout << "drop index err: " << s.message() << std::endl; - } - ASSERT_TRUE(s.ok()); + s = collection->DropIndex("dense_fp32"); + if (!s.ok()) { + std::cout << "drop index err: " << s.message() << std::endl; + } + ASSERT_TRUE(s.ok()); - auto schema1 = collection->Schema().value(); + auto schema1 = collection->Schema().value(); - collection.reset(); + collection.reset(); - auto result = Collection::Open(col_path, options); - ASSERT_TRUE(result.has_value()); + auto result = Collection::Open(col_path, options); + ASSERT_TRUE(result.has_value()); - collection = std::move(result.value()); - auto schema2 = collection->Schema().value(); + collection = std::move(result.value()); + auto schema2 = collection->Schema().value(); - if (schema1 != schema2) { - std::cout << "schema1: " << schema1.to_string_formatted() << std::endl; - std::cout << "schema2: " << schema2.to_string_formatted() << std::endl; - } - ASSERT_EQ(schema1, schema2); + if (schema1 != schema2) { + std::cout << "schema1: " << schema1.to_string_formatted() << std::endl; + std::cout << "schema2: " << schema2.to_string_formatted() << std::endl; + } + ASSERT_EQ(schema1, schema2); - stats = collection->Stats().value(); - ASSERT_EQ(stats.doc_count, 0); - ASSERT_EQ(stats.index_completeness["dense_fp32"], 1); + stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, 0); + ASSERT_EQ(stats.index_completeness["dense_fp32"], 1); + }; + // func(true); + func(false); } TEST_F(CollectionTest, Feature_DropIndex_Vector) { @@ -2527,14 +2556,14 @@ TEST_F(CollectionTest, Feature_DropIndex_AfterCreate) { } TEST_F(CollectionTest, Feature_Optimize_General) { - auto func = [](int concurrency) { + auto func = [](bool enable_mmap, int concurrency) { FileHelper::RemoveDirectory(col_path); int doc_count = 1000; // create empty collection auto schema = TestHelper::CreateSchemaWithVectorIndex(); - auto options = CollectionOptions{false, true, 64 * 1024 * 1024}; + auto options = CollectionOptions{false, enable_mmap, 64 * 1024 * 1024}; auto collection = TestHelper::CreateCollectionWithDoc( col_path, *schema, options, 0, doc_count, false); @@ -2586,12 +2615,15 @@ TEST_F(CollectionTest, Feature_Optimize_General) { std::cout << "check success 3" << std::endl; }; - func(0); - func(4); + for (bool enable_mmap : {/*true,*/ false}) { + func(enable_mmap, 0); + func(enable_mmap, 4); + } } TEST_F(CollectionTest, Feature_Optimize_Repeated) { - auto run_repeated_optimize_test = [&](IndexParams::Ptr index_params) { + auto run_repeated_optimize_test = [&](bool enable_mmap, + IndexParams::Ptr index_params) { ASSERT_NE(index_params, nullptr); SCOPED_TRACE(testing::Message() << "index_params=" << index_params->to_string()); @@ -2600,7 +2632,7 @@ TEST_F(CollectionTest, Feature_Optimize_Repeated) { int doc_count = 1000; auto schema = TestHelper::CreateSchemaWithVectorIndex(false, "demo", index_params); - auto options = CollectionOptions{false, true, 64 * 1024 * 1024}; + auto options = CollectionOptions{false, enable_mmap, 64 * 1024 * 1024}; auto collection = TestHelper::CreateCollectionWithDoc( col_path, *schema, options, 0, doc_count, false); @@ -2741,22 +2773,31 @@ TEST_F(CollectionTest, Feature_Optimize_Repeated) { }; - run_repeated_optimize_test(std::make_shared( - MetricType::IP, QuantizeType::UNDEFINED)); - run_repeated_optimize_test( - std::make_shared(MetricType::IP, QuantizeType::FP16)); - run_repeated_optimize_test(std::make_shared( - MetricType::IP, 16, 200, QuantizeType::UNDEFINED)); - run_repeated_optimize_test(std::make_shared( - MetricType::IP, 16, 200, QuantizeType::FP16)); - run_repeated_optimize_test(std::make_shared( - MetricType::IP, 10, 4, false, QuantizeType::UNDEFINED)); - run_repeated_optimize_test(std::make_shared( - MetricType::IP, 10, 4, false, QuantizeType::FP16)); + for (bool enable_mmap : {/*true,*/ false}) { + run_repeated_optimize_test(enable_mmap, + std::make_shared( + MetricType::IP, QuantizeType::UNDEFINED)); + run_repeated_optimize_test( + enable_mmap, + std::make_shared(MetricType::IP, QuantizeType::FP16)); + run_repeated_optimize_test( + enable_mmap, std::make_shared( + MetricType::IP, 16, 200, QuantizeType::UNDEFINED)); + run_repeated_optimize_test( + enable_mmap, std::make_shared(MetricType::IP, 16, 200, + QuantizeType::FP16)); + run_repeated_optimize_test(enable_mmap, std::make_shared( + MetricType::IP, 10, 4, false, + QuantizeType::UNDEFINED)); + run_repeated_optimize_test( + enable_mmap, std::make_shared( + MetricType::IP, 10, 4, false, QuantizeType::FP16)); #if RABITQ_SUPPORTED - run_repeated_optimize_test(std::make_shared( - MetricType::IP, 7, 256, 16, 200, 0)); + run_repeated_optimize_test( + enable_mmap, std::make_shared(MetricType::IP, 7, + 256, 16, 200, 0)); #endif + } } TEST_F(CollectionTest, Feature_Optimize_MetricType) { @@ -3428,13 +3469,13 @@ TEST_F(CollectionTest, Feature_Query_Validate) { } TEST_F(CollectionTest, Feature_Query_General) { - auto func = [&](std::string field_name) { + auto func = [&](bool enable_mmap, std::string field_name) { FileHelper::RemoveDirectory(col_path); int doc_count = 1000; // create with normal schema auto schema = TestHelper::CreateNormalSchema(); - auto options = CollectionOptions{false, true, 100 * 1024 * 1024}; + auto options = CollectionOptions{false, enable_mmap, 100 * 1024 * 1024}; auto collection = TestHelper::CreateCollectionWithDoc( col_path, *schema, options, 0, doc_count); @@ -3496,8 +3537,10 @@ TEST_F(CollectionTest, Feature_Query_General) { } }; - func("dense_fp32"); - func("sparse_fp32"); + for (bool enable_mmap : {/*true,*/ false}) { + func(enable_mmap, "dense_fp32"); + func(enable_mmap, "sparse_fp32"); + } } TEST_F(CollectionTest, Feature_Query_Empty) { @@ -4114,69 +4157,73 @@ TEST_F(CollectionTest, Feature_MultiQuery_CallbackReranker) { TEST_F(CollectionTest, Feature_GroupByQuery) {} TEST_F(CollectionTest, Feature_AddColumn_General) { - // create collection - int doc_count = 1000; - auto schema = TestHelper::CreateNormalSchema(); - auto options = CollectionOptions{false, true, 64 * 1024 * 1024}; - auto collection = TestHelper::CreateCollectionWithDoc( - col_path, *schema, options, 0, doc_count, false); + auto func = [&](bool enable_mmap) { + // create collection + int doc_count = 1000; + auto schema = TestHelper::CreateNormalSchema(); + auto options = CollectionOptions{false, enable_mmap, 64 * 1024 * 1024}; + auto collection = TestHelper::CreateCollectionWithDoc( + col_path, *schema, options, 0, doc_count, false); - ASSERT_TRUE(collection->Flush().ok()); - auto stats = collection->Stats().value(); - ASSERT_EQ(stats.doc_count, doc_count); - auto field_schema = - std::make_shared("add_int32", DataType::INT32, false); - auto s = collection->AddColumn(field_schema, "int32", AddColumnOptions()); - if (!s.ok()) { - std::cout << "status: " << s.message() << std::endl; - ASSERT_TRUE(false); - } - auto new_schema = collection->Schema().value(); - ASSERT_TRUE(new_schema.has_field("add_int32")); + ASSERT_TRUE(collection->Flush().ok()); + auto stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, doc_count); + auto field_schema = + std::make_shared("add_int32", DataType::INT32, false); + auto s = collection->AddColumn(field_schema, "int32", AddColumnOptions()); + if (!s.ok()) { + std::cout << "status: " << s.message() << std::endl; + ASSERT_TRUE(false); + } + auto new_schema = collection->Schema().value(); + ASSERT_TRUE(new_schema.has_field("add_int32")); - stats = collection->Stats().value(); - ASSERT_EQ(stats.doc_count, doc_count); + stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, doc_count); - auto check_doc = [&](int doc_count) { - for (int i = 0; i < doc_count; i++) { - auto expect_doc = TestHelper::CreateDoc(i, new_schema); - auto result = collection->Fetch({expect_doc.pk()}); - ASSERT_TRUE(result.has_value()); - ASSERT_EQ(result.value().size(), 1); - ASSERT_EQ(result.value().count(expect_doc.pk()), 1); - auto doc = result.value()[expect_doc.pk()]; - ASSERT_NE(doc, nullptr); - if (*doc != expect_doc) { - std::cout << " doc:" << doc->to_detail_string() << std::endl; - std::cout << "expect_doc:" << expect_doc.to_detail_string() - << std::endl; + auto check_doc = [&](int doc_count) { + for (int i = 0; i < doc_count; i++) { + auto expect_doc = TestHelper::CreateDoc(i, new_schema); + auto result = collection->Fetch({expect_doc.pk()}); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value().size(), 1); + ASSERT_EQ(result.value().count(expect_doc.pk()), 1); + auto doc = result.value()[expect_doc.pk()]; + ASSERT_NE(doc, nullptr); + if (*doc != expect_doc) { + std::cout << " doc:" << doc->to_detail_string() << std::endl; + std::cout << "expect_doc:" << expect_doc.to_detail_string() + << std::endl; + } + ASSERT_EQ(*doc, expect_doc); } - ASSERT_EQ(*doc, expect_doc); - } - }; + }; - check_doc(doc_count); + check_doc(doc_count); - // validate query result - for (int i = 1; i < 2; i++) { - VectorQuery query; - query.topk_ = 10; - query.include_vector_ = true; + // validate query result + for (int i = 1; i < 2; i++) { + VectorQuery query; + query.topk_ = 10; + query.include_vector_ = true; - auto result = collection->Query(query); - if (!result.has_value()) { - std::cout << "err: " << result.error().message() << std::endl; - } - ASSERT_TRUE(result.has_value()); - ASSERT_EQ(result.value().size(), std::min(query.topk_, doc_count)); + auto result = collection->Query(query); + if (!result.has_value()) { + std::cout << "err: " << result.error().message() << std::endl; + } + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value().size(), std::min(query.topk_, doc_count)); - auto fields_name = new_schema.all_field_names(); - for (int j = 0; j < std::min(query.topk_, doc_count); j++) { - auto result_doc = result.value()[j]; - auto doc_fields_names = result_doc->field_names(); - ASSERT_TRUE(vectors_equal_when_sorted(fields_name, doc_fields_names)); + auto fields_name = new_schema.all_field_names(); + for (int j = 0; j < std::min(query.topk_, doc_count); j++) { + auto result_doc = result.value()[j]; + auto doc_fields_names = result_doc->field_names(); + ASSERT_TRUE(vectors_equal_when_sorted(fields_name, doc_fields_names)); + } } - } + }; + // func(true); + func(false); } TEST_F(CollectionTest, Feature_AddColumn_CornerCase) { From bdeaa63933159e65eacda230983d403c98580700 Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Fri, 29 May 2026 16:33:30 +0800 Subject: [PATCH 39/47] clang format --- src/core/interface/indexes/ivf_index.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/core/interface/indexes/ivf_index.cc b/src/core/interface/indexes/ivf_index.cc index d38afbabd..1b91eebea 100644 --- a/src/core/interface/indexes/ivf_index.cc +++ b/src/core/interface/indexes/ivf_index.cc @@ -90,13 +90,16 @@ int IVFIndex::Open(const std::string &file_path, // MMapFileReadStorage so the freshly-dumped file can be reopened. storage_ = core::IndexFactory::CreateStorage("MMapFileReadStorage"); if (storage_ == nullptr) { - LOG_ERROR("Failed to create MMapFileReadStorage (IVF buffer-pool fallback)"); + LOG_ERROR( + "Failed to create MMapFileReadStorage (IVF buffer-pool fallback)"); return core::IndexError_Runtime; } int ret = storage_->init(storage_params); if (ret != 0) { - LOG_ERROR("Failed to init MMapFileReadStorage (IVF buffer-pool fallback), path: %s, err: %s", - file_path_.c_str(), core::IndexError::What(ret)); + LOG_ERROR( + "Failed to init MMapFileReadStorage (IVF buffer-pool fallback), " + "path: %s, err: %s", + file_path_.c_str(), core::IndexError::What(ret)); return ret; } break; From e796a31485c9cf37d01b133ddf72f7cea8ec758f Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Fri, 29 May 2026 16:36:46 +0800 Subject: [PATCH 40/47] fix ut --- tests/db/collection_test.cc | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/tests/db/collection_test.cc b/tests/db/collection_test.cc index 76910c3bb..2707b645c 100644 --- a/tests/db/collection_test.cc +++ b/tests/db/collection_test.cc @@ -47,8 +47,8 @@ std::string col_path = "test_collection"; class CollectionTest : public ::testing::Test { protected: void SetUp() override { - zvec::ailego::MemoryLimitPool::get_instance().init( - 2 * 1024ll * 1024ll * 1024ll); + zvec::ailego::MemoryLimitPool::get_instance().init(2 * 1024ll * 1024ll * + 1024ll); FileHelper::RemoveDirectory(col_path); } @@ -183,7 +183,7 @@ TEST_F(CollectionTest, Feature_CreateAndOpen_General) { ASSERT_TRUE(result1.has_value()); auto col1 = result1.value(); }; - // func(true); + func(true); func(false); } @@ -484,7 +484,7 @@ TEST_F(CollectionTest, Feature_Insert_General) { ASSERT_EQ(stats.index_completeness["sparse_fp16"], 1); }; - for (bool enable_mmap : {/*true,*/ false}) { + for (bool enable_mmap : {true, false}) { func(enable_mmap, false, false); func(enable_mmap, true, true); func(enable_mmap, true, false); @@ -904,7 +904,7 @@ TEST_F(CollectionTest, Feature_Upsert_General) { ASSERT_EQ(stats.index_completeness["sparse_fp16"], 1); }; - for (bool enable_mmap : {/*true,*/ false}) { + for (bool enable_mmap : {true, false}) { func(enable_mmap, false, false); func(enable_mmap, true, true); func(enable_mmap, true, false); @@ -1190,7 +1190,7 @@ TEST_F(CollectionTest, Feature_Update_General) { check_doc(doc_count); }; - for (bool enable_mmap : {/*true,*/ false}) { + for (bool enable_mmap : {true, false}) { func(enable_mmap, 99); func(enable_mmap, 100); func(enable_mmap, 101); @@ -1527,7 +1527,7 @@ TEST_F(CollectionTest, Feature_Delete_General) { check_doc(doc_count); }; - for (bool enable_mmap : {/*true,*/ false}) { + for (bool enable_mmap : {true, false}) { func(enable_mmap, 99); func(enable_mmap, 100); func(enable_mmap, 101); @@ -1673,7 +1673,7 @@ TEST_F(CollectionTest, Feature_DeleteByFilter_General) { check_doc(doc_count); }; - for (bool enable_mmap : {/*true,*/ false}) { + for (bool enable_mmap : {true, false}) { func(enable_mmap, 99); func(enable_mmap, 100); func(enable_mmap, 101); @@ -1861,7 +1861,7 @@ TEST_F(CollectionTest, Feature_MixedWrite_General) { ASSERT_EQ(stats.doc_count, i + 1); } }; - // func(true); + func(true); func(false); } @@ -1894,7 +1894,7 @@ TEST_F(CollectionTest, Feature_CreateIndex_General) { s = collection->CreateIndex("dense_fp32_invalid", index_params); ASSERT_FALSE(s.ok()); }; - // func(true); + func(true); func(false); } @@ -2323,7 +2323,7 @@ TEST_F(CollectionTest, Feature_DropIndex_General) { ASSERT_EQ(stats.doc_count, 0); ASSERT_EQ(stats.index_completeness["dense_fp32"], 1); }; - // func(true); + func(true); func(false); } @@ -2615,7 +2615,7 @@ TEST_F(CollectionTest, Feature_Optimize_General) { std::cout << "check success 3" << std::endl; }; - for (bool enable_mmap : {/*true,*/ false}) { + for (bool enable_mmap : {true, false}) { func(enable_mmap, 0); func(enable_mmap, 4); } @@ -2773,7 +2773,7 @@ TEST_F(CollectionTest, Feature_Optimize_Repeated) { }; - for (bool enable_mmap : {/*true,*/ false}) { + for (bool enable_mmap : {true, false}) { run_repeated_optimize_test(enable_mmap, std::make_shared( MetricType::IP, QuantizeType::UNDEFINED)); @@ -3537,7 +3537,7 @@ TEST_F(CollectionTest, Feature_Query_General) { } }; - for (bool enable_mmap : {/*true,*/ false}) { + for (bool enable_mmap : {true, false}) { func(enable_mmap, "dense_fp32"); func(enable_mmap, "sparse_fp32"); } @@ -4222,7 +4222,7 @@ TEST_F(CollectionTest, Feature_AddColumn_General) { } } }; - // func(true); + func(true); func(false); } From 971da9823af324dc8f580e05fca3bfef9b70a156 Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Fri, 29 May 2026 16:50:34 +0800 Subject: [PATCH 41/47] fix ut --- tests/db/collection_test.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/db/collection_test.cc b/tests/db/collection_test.cc index 2707b645c..d586f815f 100644 --- a/tests/db/collection_test.cc +++ b/tests/db/collection_test.cc @@ -1867,6 +1867,7 @@ TEST_F(CollectionTest, Feature_MixedWrite_General) { TEST_F(CollectionTest, Feature_CreateIndex_General) { auto func = [&](bool enable_mmap) { + FileHelper::RemoveDirectory(col_path); // create empty collection auto schema = TestHelper::CreateNormalSchema(); auto options = CollectionOptions{false, enable_mmap, 64 * 1024 * 1024}; @@ -2256,6 +2257,7 @@ TEST_F(CollectionTest, Feature_CreateIndex_Scalar) { TEST_F(CollectionTest, Feature_DropIndex_General) { auto func = [&](bool enable_mmap) { + FileHelper::RemoveDirectory(col_path); // create empty collection auto schema = TestHelper::CreateSchemaWithVectorIndex(); auto options = CollectionOptions{false, enable_mmap, 64 * 1024 * 1204}; @@ -4158,6 +4160,7 @@ TEST_F(CollectionTest, Feature_GroupByQuery) {} TEST_F(CollectionTest, Feature_AddColumn_General) { auto func = [&](bool enable_mmap) { + FileHelper::RemoveDirectory(col_path); // create collection int doc_count = 1000; auto schema = TestHelper::CreateNormalSchema(); From f17f45b11c021ccbc3707c25f9b06308e4ef45f1 Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Fri, 29 May 2026 19:50:39 +0800 Subject: [PATCH 42/47] fix ut --- src/ailego/buffer/vector_page_table.cc | 9 +++++++ src/core/utility/buffer_storage.cc | 37 +++++++++++++++++++++----- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/src/ailego/buffer/vector_page_table.cc b/src/ailego/buffer/vector_page_table.cc index 8e5c43f30..c9296d640 100644 --- a/src/ailego/buffer/vector_page_table.cc +++ b/src/ailego/buffer/vector_page_table.cc @@ -545,6 +545,10 @@ char *VecBufferPoolHandle::get_single_page(size_t file_offset, size_t len, out_page_id = first_page; char *page = pool_.acquire_buffer(first_page, 50); if (!page) { + LOG_ERROR( + "VecBufferPoolHandle::get_single_page: acquire_buffer failed, " + "file_offset=%zu, len=%zu, page=%zu, page_size=%zu", + file_offset, len, first_page, kVectorPageSize); return nullptr; } return page + (file_offset - first_page * kVectorPageSize); @@ -562,6 +566,11 @@ bool VecBufferPoolHandle::read_range(size_t file_offset, size_t len, for (size_t pg = first_page; pg <= last_page; ++pg) { char *page = pool_.acquire_buffer(pg, 50); if (!page) { + LOG_ERROR( + "VecBufferPoolHandle::read_range: acquire_buffer failed, " + "file_offset=%zu, len=%zu, page=%zu, first_page=%zu, last_page=%zu, " + "page_size=%zu", + file_offset, len, pg, first_page, last_page, kVectorPageSize); return false; } size_t page_start = pg * kVectorPageSize; diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index caaa3cf8a..13e9728f4 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -103,6 +103,10 @@ class BufferStorage : public IndexStorage { segment_info_->segment.meta()->data_index + offset; if (!owner_->buffer_pool_handle_->read_range(abs_offset, len, static_cast(buf))) { + LOG_ERROR( + "WrappedSegment::fetch: read_range failed, file[%s], id[%zu], " + "abs_offset=%zu, len=%zu", + owner_->file_name_.c_str(), segment_id_, abs_offset, len); return 0; } return len; @@ -136,6 +140,11 @@ class BufferStorage : public IndexStorage { char *raw = owner_->buffer_pool_handle_->get_single_page(abs_offset, len, page_id); if (!raw) { + LOG_ERROR( + "WrappedSegment::read: single-page acquire failed, file[%s], " + "id[%zu], abs_offset=%zu, len=%zu, page=%zu", + owner_->file_name_.c_str(), segment_id_, abs_offset, len, + first_page); *data = nullptr; return 0; } @@ -146,16 +155,29 @@ class BufferStorage : public IndexStorage { return len; } // Cross-page path: see file-level banner. C11 aligned_alloc requires - // size to be a multiple of alignment. - const size_t kAlign = 4096UL; + // size to be a multiple of alignment, and alignment must be a power + // of two; kVectorPageSize is sysconf(_SC_PAGESIZE) which satisfies + // both, and matches the buffer-pool's actual page granularity across + // platforms (e.g. 4K on Linux, 16K on iOS arm64 / some Android arm64). + const size_t kAlign = ailego::kVectorPageSize; size_t alloc_size = (len + (kAlign - 1UL)) & ~(kAlign - 1UL); char *tmp = static_cast(ailego_aligned_malloc(alloc_size, kAlign)); if (!tmp) { + LOG_ERROR( + "WrappedSegment::read: cross-page alloc failed, file[%s], " + "id[%zu], abs_offset=%zu, len=%zu, alloc_size=%zu, align=%zu", + owner_->file_name_.c_str(), segment_id_, abs_offset, len, + alloc_size, kAlign); *data = nullptr; return 0; } if (!owner_->buffer_pool_handle_->read_range(abs_offset, len, tmp)) { + LOG_ERROR( + "WrappedSegment::read: cross-page read_range failed, file[%s], " + "id[%zu], abs_offset=%zu, len=%zu, first_page=%zu, last_page=%zu", + owner_->file_name_.c_str(), segment_id_, abs_offset, len, + first_page, last_page); ailego_free(tmp); *data = nullptr; return 0; @@ -203,10 +225,13 @@ class BufferStorage : public IndexStorage { return len; } // C11 aligned_alloc requires the requested size to be a multiple of - // the alignment; round len up to the next 4K boundary. Without this - // glibc treats the call as undefined behaviour and silently corrupts - // heap metadata (manifesting later as `corrupted size vs. prev_size`). - const size_t kAlign = 4096UL; + // the alignment, and alignment must be a power of two. Use the + // buffer-pool page granularity (sysconf(_SC_PAGESIZE)) which is the + // actual page size across platforms (e.g. 4K on Linux, 16K on iOS + // arm64 / some Android arm64), avoiding a hard-coded 4K mismatch. + // Without correct alignment some libcs (notably Bionic) silently + // return NULL or corrupt heap metadata. + const size_t kAlign = ailego::kVectorPageSize; size_t alloc_size = (len + (kAlign - 1UL)) & ~(kAlign - 1UL); char *tmp = static_cast(ailego_aligned_malloc(alloc_size, kAlign)); From 12b1337c0ed27a68c8062642bb94aa326f8d00eb Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Sat, 30 May 2026 00:55:22 +0800 Subject: [PATCH 43/47] fix --- src/core/utility/buffer_storage.cc | 65 ++++++++++++++++++++++-------- 1 file changed, 49 insertions(+), 16 deletions(-) diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index 13e9728f4..bc62822f8 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -62,8 +62,15 @@ class BufferStorage : public IndexStorage { ~WrappedSegment(void) override {} //! Retrieve size of data + //! + //! data_size / padding_size are mutated lock-free by concurrent + //! writers (write/resize) and observed by concurrent readers on the + //! lock-free hot path. Use acquire/release ordering so weakly-ordered + //! ARM (e.g. Android arm64) cannot see stale values that would cause + //! read() to truncate len to 0. size_t data_size(void) const override { - return static_cast(segment_info_->segment.meta()->data_size); + return static_cast(__atomic_load_n( + &segment_info_->segment.meta()->data_size, __ATOMIC_ACQUIRE)); } //! Retrieve crc of data @@ -73,7 +80,8 @@ class BufferStorage : public IndexStorage { //! Retrieve size of padding size_t padding_size(void) const override { - return static_cast(segment_info_->segment.meta()->padding_size); + return static_cast(__atomic_load_n( + &segment_info_->segment.meta()->padding_size, __ATOMIC_ACQUIRE)); } //! Retrieve capacity of segment @@ -91,7 +99,8 @@ class BufferStorage : public IndexStorage { owner_->file_name_.c_str(), segment_id_); return 0; } - const size_t data_size = segment_info_->segment.meta()->data_size; + const size_t data_size = __atomic_load_n( + &segment_info_->segment.meta()->data_size, __ATOMIC_ACQUIRE); if (ailego_unlikely(offset > data_size || len > data_size - offset)) { if (offset > data_size) { offset = data_size; @@ -121,7 +130,8 @@ class BufferStorage : public IndexStorage { *data = nullptr; return 0; } - const size_t data_size = segment_info_->segment.meta()->data_size; + const size_t data_size = __atomic_load_n( + &segment_info_->segment.meta()->data_size, __ATOMIC_ACQUIRE); if (ailego_unlikely(offset > data_size || len > data_size - offset)) { if (offset > data_size) { offset = data_size; @@ -199,7 +209,8 @@ class BufferStorage : public IndexStorage { owner_->file_name_.c_str(), segment_id_); return 0; } - const size_t data_size = segment_info_->segment.meta()->data_size; + const size_t data_size = __atomic_load_n( + &segment_info_->segment.meta()->data_size, __ATOMIC_ACQUIRE); if (ailego_unlikely(offset > data_size || len > data_size - offset)) { if (offset > data_size) { offset = data_size; @@ -283,22 +294,38 @@ class BufferStorage : public IndexStorage { return 0; } auto meta = segment_info_->segment.meta(); - { - std::lock_guard meta_latch(meta_mtx_); - if (offset + len > meta->data_size) { - meta->data_size = offset + len; - meta->padding_size = capacity_ - meta->data_size; - } - } size_t abs_offset = segment_info_->segment_header_start_offset + segment_info_->segment_header->content_offset + - segment_info_->segment.meta()->data_index + offset; + meta->data_index + offset; + // Write the bytes BEFORE publishing the new data_size to readers. + // Lock-free readers observe data_size with acquire ordering; the + // release-store below establishes happens-before with the page + // contents written above. Publishing data_size first (the previous + // ordering) allowed a reader on weakly-ordered ARM to see the new + // length but still read stale page contents -- or, in the inverse + // direction, see a stale length and truncate len to 0 + // (root cause of "Read sparse vector failed ... ret=0"). if (owner_->buffer_pool_handle_->write_range( abs_offset, len, static_cast(data)) != 0) { LOG_ERROR("write() page-cache write_range failed at abs_offset=%zu", abs_offset); return 0; } + { + std::lock_guard meta_latch(meta_mtx_); + uint64_t cur = + __atomic_load_n(&meta->data_size, __ATOMIC_RELAXED); + if (offset + len > cur) { + uint64_t new_size = offset + len; + // padding_size is paired with data_size; publish it first + // (relaxed) so readers that acquire data_size see a + // consistent (data_size + padding_size == capacity_) pair. + __atomic_store_n(&meta->padding_size, capacity_ - new_size, + __ATOMIC_RELAXED); + __atomic_store_n(&meta->data_size, new_size, + __ATOMIC_RELEASE); + } + } // Mark dirty unconditionally even when data_size did not grow: // fixed-size in-place rewrites (e.g. chunk_meta_segment) must still // trigger flush_all() before the next append_segment(). @@ -321,12 +348,18 @@ class BufferStorage : public IndexStorage { bool changed = false; { std::lock_guard meta_latch(meta_mtx_); - if (meta->data_size != size) { + uint64_t cur = + __atomic_load_n(&meta->data_size, __ATOMIC_RELAXED); + if (cur != size) { if (size > capacity_) { size = capacity_; } - meta->data_size = size; - meta->padding_size = capacity_ - size; + // See write() for the publish ordering rationale: padding first + // (relaxed), then release-store data_size so concurrent lock-free + // readers observe a consistent pair. + __atomic_store_n(&meta->padding_size, capacity_ - size, + __ATOMIC_RELAXED); + __atomic_store_n(&meta->data_size, size, __ATOMIC_RELEASE); changed = true; } } From 9b2edc506bb3dd037c1effda29a97c3dca437916 Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Sat, 30 May 2026 09:47:26 +0800 Subject: [PATCH 44/47] clang format --- src/core/utility/buffer_storage.cc | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index bc62822f8..6aca9ffec 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -313,8 +313,7 @@ class BufferStorage : public IndexStorage { } { std::lock_guard meta_latch(meta_mtx_); - uint64_t cur = - __atomic_load_n(&meta->data_size, __ATOMIC_RELAXED); + uint64_t cur = __atomic_load_n(&meta->data_size, __ATOMIC_RELAXED); if (offset + len > cur) { uint64_t new_size = offset + len; // padding_size is paired with data_size; publish it first @@ -322,8 +321,7 @@ class BufferStorage : public IndexStorage { // consistent (data_size + padding_size == capacity_) pair. __atomic_store_n(&meta->padding_size, capacity_ - new_size, __ATOMIC_RELAXED); - __atomic_store_n(&meta->data_size, new_size, - __ATOMIC_RELEASE); + __atomic_store_n(&meta->data_size, new_size, __ATOMIC_RELEASE); } } // Mark dirty unconditionally even when data_size did not grow: @@ -348,8 +346,7 @@ class BufferStorage : public IndexStorage { bool changed = false; { std::lock_guard meta_latch(meta_mtx_); - uint64_t cur = - __atomic_load_n(&meta->data_size, __ATOMIC_RELAXED); + uint64_t cur = __atomic_load_n(&meta->data_size, __ATOMIC_RELAXED); if (cur != size) { if (size > capacity_) { size = capacity_; From de292b05684946e88fd42fbfb902eae0b034490d Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Sat, 30 May 2026 10:59:45 +0800 Subject: [PATCH 45/47] fix --- src/core/utility/buffer_storage.cc | 105 +++++++++++++++++++++-------- 1 file changed, 76 insertions(+), 29 deletions(-) diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index 6aca9ffec..b2470facb 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -31,6 +31,54 @@ namespace zvec { namespace core { +namespace { + +// Cross-compiler helpers for lock-free 64-bit acquire/release access +// to SegmentMeta::data_size / padding_size. +// +// These fields are POD (uint64_t) inside a serialised struct so we cannot +// change their type to std::atomic<>; std::atomic_ref is C++20 and the +// project targets C++17. GCC/Clang have native __atomic_* builtins that +// emit single ldar/stlr on arm64 and plain mov on x86_64. MSVC lacks +// these builtins, so we fall back to volatile load/store paired with a +// std::atomic_thread_fence, which is correct on all targets MSVC ships +// (x86_64 / arm64 desktop) and equivalent in cost. +inline uint64_t bs_load_acquire(const uint64_t *p) { +#if defined(__GNUC__) || defined(__clang__) + return __atomic_load_n(p, __ATOMIC_ACQUIRE); +#else + uint64_t v = *static_cast(p); + std::atomic_thread_fence(std::memory_order_acquire); + return v; +#endif +} + +inline uint64_t bs_load_relaxed(const uint64_t *p) { +#if defined(__GNUC__) || defined(__clang__) + return __atomic_load_n(p, __ATOMIC_RELAXED); +#else + return *static_cast(p); +#endif +} + +inline void bs_store_release(uint64_t *p, uint64_t v) { +#if defined(__GNUC__) || defined(__clang__) + __atomic_store_n(p, v, __ATOMIC_RELEASE); +#else + std::atomic_thread_fence(std::memory_order_release); + *static_cast(p) = v; +#endif +} + +inline void bs_store_relaxed(uint64_t *p, uint64_t v) { +#if defined(__GNUC__) || defined(__clang__) + __atomic_store_n(p, v, __ATOMIC_RELAXED); +#else + *static_cast(p) = v; +#endif +} + +} // namespace // The legacy read(const void**) overload guarantees the returned pointer // stays valid until close_index(). Single-page reads pin the page @@ -69,8 +117,8 @@ class BufferStorage : public IndexStorage { //! ARM (e.g. Android arm64) cannot see stale values that would cause //! read() to truncate len to 0. size_t data_size(void) const override { - return static_cast(__atomic_load_n( - &segment_info_->segment.meta()->data_size, __ATOMIC_ACQUIRE)); + return static_cast( + bs_load_acquire(&segment_info_->segment.meta()->data_size)); } //! Retrieve crc of data @@ -80,8 +128,8 @@ class BufferStorage : public IndexStorage { //! Retrieve size of padding size_t padding_size(void) const override { - return static_cast(__atomic_load_n( - &segment_info_->segment.meta()->padding_size, __ATOMIC_ACQUIRE)); + return static_cast( + bs_load_acquire(&segment_info_->segment.meta()->padding_size)); } //! Retrieve capacity of segment @@ -99,8 +147,8 @@ class BufferStorage : public IndexStorage { owner_->file_name_.c_str(), segment_id_); return 0; } - const size_t data_size = __atomic_load_n( - &segment_info_->segment.meta()->data_size, __ATOMIC_ACQUIRE); + const size_t data_size = + bs_load_acquire(&segment_info_->segment.meta()->data_size); if (ailego_unlikely(offset > data_size || len > data_size - offset)) { if (offset > data_size) { offset = data_size; @@ -130,8 +178,8 @@ class BufferStorage : public IndexStorage { *data = nullptr; return 0; } - const size_t data_size = __atomic_load_n( - &segment_info_->segment.meta()->data_size, __ATOMIC_ACQUIRE); + const size_t data_size = + bs_load_acquire(&segment_info_->segment.meta()->data_size); if (ailego_unlikely(offset > data_size || len > data_size - offset)) { if (offset > data_size) { offset = data_size; @@ -166,10 +214,14 @@ class BufferStorage : public IndexStorage { } // Cross-page path: see file-level banner. C11 aligned_alloc requires // size to be a multiple of alignment, and alignment must be a power - // of two; kVectorPageSize is sysconf(_SC_PAGESIZE) which satisfies - // both, and matches the buffer-pool's actual page granularity across - // platforms (e.g. 4K on Linux, 16K on iOS arm64 / some Android arm64). - const size_t kAlign = ailego::kVectorPageSize; + // of two. Use a fixed 4096-byte alignment for the dst buffer: 4K is + // the minimum page granularity across all supported platforms + // (always a divisor of the 16K/64K page sizes used on Apple Silicon + // and some Android arm64 configurations) and is sufficient for the + // downstream SIMD/DMA-friendly access contract. Pinning kAlign to + // 4096 also avoids over-allocating 16KB per cross-page read on + // large-page platforms. + static constexpr size_t kAlign = 4096UL; size_t alloc_size = (len + (kAlign - 1UL)) & ~(kAlign - 1UL); char *tmp = static_cast(ailego_aligned_malloc(alloc_size, kAlign)); @@ -209,8 +261,8 @@ class BufferStorage : public IndexStorage { owner_->file_name_.c_str(), segment_id_); return 0; } - const size_t data_size = __atomic_load_n( - &segment_info_->segment.meta()->data_size, __ATOMIC_ACQUIRE); + const size_t data_size = + bs_load_acquire(&segment_info_->segment.meta()->data_size); if (ailego_unlikely(offset > data_size || len > data_size - offset)) { if (offset > data_size) { offset = data_size; @@ -236,13 +288,10 @@ class BufferStorage : public IndexStorage { return len; } // C11 aligned_alloc requires the requested size to be a multiple of - // the alignment, and alignment must be a power of two. Use the - // buffer-pool page granularity (sysconf(_SC_PAGESIZE)) which is the - // actual page size across platforms (e.g. 4K on Linux, 16K on iOS - // arm64 / some Android arm64), avoiding a hard-coded 4K mismatch. - // Without correct alignment some libcs (notably Bionic) silently - // return NULL or corrupt heap metadata. - const size_t kAlign = ailego::kVectorPageSize; + // the alignment, and alignment must be a power of two. See the + // sibling read(const void**) overload above for the rationale of + // pinning kAlign to a fixed 4096 instead of sysconf(_SC_PAGESIZE). + static constexpr size_t kAlign = 4096UL; size_t alloc_size = (len + (kAlign - 1UL)) & ~(kAlign - 1UL); char *tmp = static_cast(ailego_aligned_malloc(alloc_size, kAlign)); @@ -313,15 +362,14 @@ class BufferStorage : public IndexStorage { } { std::lock_guard meta_latch(meta_mtx_); - uint64_t cur = __atomic_load_n(&meta->data_size, __ATOMIC_RELAXED); + uint64_t cur = bs_load_relaxed(&meta->data_size); if (offset + len > cur) { uint64_t new_size = offset + len; // padding_size is paired with data_size; publish it first // (relaxed) so readers that acquire data_size see a // consistent (data_size + padding_size == capacity_) pair. - __atomic_store_n(&meta->padding_size, capacity_ - new_size, - __ATOMIC_RELAXED); - __atomic_store_n(&meta->data_size, new_size, __ATOMIC_RELEASE); + bs_store_relaxed(&meta->padding_size, capacity_ - new_size); + bs_store_release(&meta->data_size, new_size); } } // Mark dirty unconditionally even when data_size did not grow: @@ -346,7 +394,7 @@ class BufferStorage : public IndexStorage { bool changed = false; { std::lock_guard meta_latch(meta_mtx_); - uint64_t cur = __atomic_load_n(&meta->data_size, __ATOMIC_RELAXED); + uint64_t cur = bs_load_relaxed(&meta->data_size); if (cur != size) { if (size > capacity_) { size = capacity_; @@ -354,9 +402,8 @@ class BufferStorage : public IndexStorage { // See write() for the publish ordering rationale: padding first // (relaxed), then release-store data_size so concurrent lock-free // readers observe a consistent pair. - __atomic_store_n(&meta->padding_size, capacity_ - size, - __ATOMIC_RELAXED); - __atomic_store_n(&meta->data_size, size, __ATOMIC_RELEASE); + bs_store_relaxed(&meta->padding_size, capacity_ - size); + bs_store_release(&meta->data_size, size); changed = true; } } From 3a1212656935954ca1ae9e7d7a0db1f6405f5ddd Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Sat, 30 May 2026 12:54:13 +0800 Subject: [PATCH 46/47] fix --- src/core/utility/buffer_storage.cc | 69 +++++++++++++++++++++++++----- 1 file changed, 58 insertions(+), 11 deletions(-) diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index b2470facb..0118e4285 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -223,8 +223,16 @@ class BufferStorage : public IndexStorage { // large-page platforms. static constexpr size_t kAlign = 4096UL; size_t alloc_size = (len + (kAlign - 1UL)) & ~(kAlign - 1UL); - char *tmp = - static_cast(ailego_aligned_malloc(alloc_size, kAlign)); + // Allocate a 4K-aligned slot from the per-storage arena pool. + // This batches page-aligned allocation: under heap fragmentation + // (notably Android Bionic scudo), one large posix_memalign per + // arena via the secondary (mmap-backed) allocator is far more + // reliable than many independent posix_memalign(4K, 4K) calls. + char *tmp = nullptr; + { + std::lock_guard tmp_latch(owner_->tmp_buffers_mutex_); + tmp = owner_->tmp_arena_alloc_locked(alloc_size); + } if (!tmp) { LOG_ERROR( "WrappedSegment::read: cross-page alloc failed, file[%s], " @@ -240,14 +248,12 @@ class BufferStorage : public IndexStorage { "id[%zu], abs_offset=%zu, len=%zu, first_page=%zu, last_page=%zu", owner_->file_name_.c_str(), segment_id_, abs_offset, len, first_page, last_page); - ailego_free(tmp); + // The arena slot is intentionally not rolled back: rolling back + // would require holding the arena lock across read_range, while + // the worst-case leak per failed read is one slot (alloc_size). *data = nullptr; return 0; } - { - std::lock_guard tmp_latch(owner_->tmp_buffers_mutex_); - owner_->tmp_buffers_.push_back(tmp); - } *data = tmp; return len; } @@ -998,9 +1004,9 @@ class BufferStorage : public IndexStorage { memset(&footer_, 0, sizeof(footer_)); { std::lock_guard tmp_latch(tmp_buffers_mutex_); - for (char *p : tmp_buffers_) { - if (p) { - ailego_free(p); + for (const ArenaBlock &b : tmp_buffers_) { + if (b.base) { + ailego_free(b.base); } } tmp_buffers_.clear(); @@ -1427,7 +1433,48 @@ class BufferStorage : public IndexStorage { delete; }; - std::vector tmp_buffers_{}; + // Arena slab for cross-page temp buffers handed out by + // WrappedSegment::read(const void**). The legacy contract requires + // every returned pointer to stay valid until close_index(), so slots + // are never freed individually -- they are carved out of large + // 4K-aligned arenas which are released in bulk. + // + // Why an arena instead of one posix_memalign(4K, 4K) per read: + // Android Bionic scudo's small-class chunk pool is prone to large- + // alignment starvation under fragmentation (we observed sporadic + // posix_memalign(4096, 4096) returning ENOMEM even with plenty of + // free memory). A single large request (>= kArenaSize) is served + // from scudo's secondary allocator (mmap-backed), which is reliable + // up to the true OOM boundary. + struct ArenaBlock { + char *base{nullptr}; + size_t size{0}; // Total bytes in this arena (4K-aligned). + size_t used{0}; // Bytes already handed out (4K-aligned). + }; + // Caller MUST hold tmp_buffers_mutex_. alloc_size MUST be a + // multiple of 4096. Returns nullptr only if scudo cannot satisfy a + // fresh arena allocation, i.e. effectively true OOM. + char *tmp_arena_alloc_locked(size_t alloc_size) { + static constexpr size_t kAlign = 4096UL; + static constexpr size_t kArenaSize = 1UL << 20; // 1 MiB + if (!tmp_buffers_.empty()) { + ArenaBlock &back = tmp_buffers_.back(); + if (back.base && back.size - back.used >= alloc_size) { + char *out = back.base + back.used; + back.used += alloc_size; + return out; + } + } + size_t new_size = alloc_size > kArenaSize ? alloc_size : kArenaSize; + char *p = + static_cast(ailego_aligned_malloc(new_size, kAlign)); + if (!p) { + return nullptr; + } + tmp_buffers_.push_back(ArenaBlock{p, new_size, alloc_size}); + return p; + } + std::vector tmp_buffers_{}; mutable std::mutex tmp_buffers_mutex_{}; // buffer manager From 9a5cf34f8362c88cd0cd7b5f21e29023df19b79a Mon Sep 17 00:00:00 2001 From: Zefeng Yin Date: Sat, 30 May 2026 13:03:39 +0800 Subject: [PATCH 47/47] clang format --- src/core/utility/buffer_storage.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index 0118e4285..bf2485724 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -1448,8 +1448,8 @@ class BufferStorage : public IndexStorage { // up to the true OOM boundary. struct ArenaBlock { char *base{nullptr}; - size_t size{0}; // Total bytes in this arena (4K-aligned). - size_t used{0}; // Bytes already handed out (4K-aligned). + size_t size{0}; // Total bytes in this arena (4K-aligned). + size_t used{0}; // Bytes already handed out (4K-aligned). }; // Caller MUST hold tmp_buffers_mutex_. alloc_size MUST be a // multiple of 4096. Returns nullptr only if scudo cannot satisfy a @@ -1466,8 +1466,7 @@ class BufferStorage : public IndexStorage { } } size_t new_size = alloc_size > kArenaSize ? alloc_size : kArenaSize; - char *p = - static_cast(ailego_aligned_malloc(new_size, kAlign)); + char *p = static_cast(ailego_aligned_malloc(new_size, kAlign)); if (!p) { return nullptr; }