diff --git a/src/ailego/buffer/vector_page_table.cc b/src/ailego/buffer/vector_page_table.cc index 553919fb3..c9296d640 100644 --- a/src/ailego/buffer/vector_page_table.cc +++ b/src/ailego/buffer/vector_page_table.cc @@ -13,15 +13,13 @@ // limitations under the License. #include +#include #include +#include #include #include #include -#if !defined(_MSC_VER) -#include -#endif - #if defined(_MSC_VER) #ifndef NOMINMAX #define NOMINMAX @@ -39,6 +37,29 @@ static ssize_t zvec_pread(int fd, void *buf, size_t count, size_t offset) { } return static_cast(bytes_read); } +static ssize_t zvec_pwrite(int fd, const void *buf, size_t count, + size_t offset) { + HANDLE handle = reinterpret_cast(_get_osfhandle(fd)); + if (handle == INVALID_HANDLE_VALUE) return -1; + OVERLAPPED ov = {}; + ov.Offset = static_cast(offset & 0xFFFFFFFF); + ov.OffsetHigh = static_cast(offset >> 32); + DWORD bytes_written = 0; + if (!WriteFile(handle, buf, static_cast(count), &bytes_written, &ov)) { + return -1; + } + return static_cast(bytes_written); +} +#else +#include +static inline ssize_t zvec_pread(int fd, void *buf, size_t count, + size_t offset) { + return ::pread(fd, buf, count, static_cast(offset)); +} +static inline ssize_t zvec_pwrite(int fd, const void *buf, size_t count, + size_t offset) { + return ::pwrite(fd, buf, count, static_cast(offset)); +} #endif namespace zvec { @@ -46,104 +67,220 @@ namespace ailego { const size_t kVectorPageSize = MemoryHelper::PageSize(); -void VectorPageTable::init(size_t entry_num) { - if (entries_) { - delete[] entries_; +bool VectorPageTable::init(size_t entry_num) { + size_t need_segments = (entry_num + kSegmentSize - 1) / kSegmentSize; + if (need_segments > kMaxSegments) { + LOG_ERROR( + "VectorPageTable::init: entry_num=%zu exceeds capacity " + "(kMaxEntries=%zu, need_segments=%zu, kMaxSegments=%zu); " + "refusing to init.", + entry_num, kMaxEntries, need_segments, kMaxSegments); + return false; + } + // Free old segments if any. 
init() is only called from VecBufferPool::init + // which is single-threaded with respect to other accesses, so a relaxed + // load of segment_count_ is sufficient here. + size_t old_count = segment_count_.load(std::memory_order_relaxed); + for (size_t i = 0; i < old_count; ++i) { + delete[] segments_[i]; + segments_[i] = nullptr; + } + for (size_t s = 0; s < need_segments; ++s) { + segments_[s] = new Entry[kSegmentSize]; + for (size_t i = 0; i < kSegmentSize; ++i) { + segments_[s][i].ref_count.store(std::numeric_limits::min()); + segments_[s][i].in_evict_queue.store(false); + segments_[s][i].is_dirty.store(false); + segments_[s][i].buffer = nullptr; + segments_[s][i].file_offset = 0; + } + } + // Publish new segments to readers. segment_count_ is published first + // (release) so that a reader that acquire-loads segment_count_ before + // entry_num_ also sees a consistent segment table; entry_num_ is the + // primary synchronization point used by callers via entry_num(). + segment_count_.store(need_segments, std::memory_order_release); + entry_num_.store(entry_num, std::memory_order_release); + return true; +} + +bool VectorPageTable::extend(size_t new_entry_num) { + // Relaxed read is fine: extend() is serialized by the caller (extend_file + // is invoked under the BufferStorage write latch). No other writer races + // with us on entry_num_ / segment_count_. 
+ if (new_entry_num <= entry_num_.load(std::memory_order_relaxed)) { + return true; + } + size_t new_segment_count = (new_entry_num + kSegmentSize - 1) / kSegmentSize; + if (new_segment_count > kMaxSegments) { + LOG_ERROR( + "VectorPageTable::extend: new_entry_num=%zu exceeds capacity " + "(kMaxEntries=%zu, new_segment_count=%zu, kMaxSegments=%zu); " + "refusing to extend.", + new_entry_num, kMaxEntries, new_segment_count, kMaxSegments); + return false; } - entry_num_ = entry_num; - entries_ = new Entry[entry_num_]; - for (size_t i = 0; i < entry_num_; i++) { - entries_[i].ref_count.store(std::numeric_limits::min()); - entries_[i].in_evict_queue.store(false); - entries_[i].buffer = nullptr; + size_t old_count = segment_count_.load(std::memory_order_relaxed); + for (size_t s = old_count; s < new_segment_count; ++s) { + segments_[s] = new Entry[kSegmentSize]; + for (size_t i = 0; i < kSegmentSize; ++i) { + segments_[s][i].ref_count.store(std::numeric_limits::min()); + segments_[s][i].in_evict_queue.store(false); + segments_[s][i].is_dirty.store(false); + segments_[s][i].buffer = nullptr; + segments_[s][i].file_offset = 0; + } } + // Publish in the same order as init(): segment_count_ first, entry_num_ + // last. Both are release-stores so that the prior segment allocation / + // Entry initialization is visible to any reader that acquire-loads either + // counter (typically via entry_num()). 
+ segment_count_.store(new_segment_count, std::memory_order_release); + entry_num_.store(new_entry_num, std::memory_order_release); + return true; } char *VectorPageTable::acquire_block(block_id_t block_id) { - assert(block_id < entry_num_); - Entry &entry = entries_[block_id]; + assert(block_id < entry_num_.load(std::memory_order_relaxed)); + Entry &e = entry_at(block_id); while (true) { - int current_count = entry.ref_count.load(std::memory_order_acquire); + int current_count = e.ref_count.load(std::memory_order_acquire); if (current_count < 0) { return nullptr; } - if (entry.ref_count.compare_exchange_weak(current_count, current_count + 1, - std::memory_order_acq_rel, - std::memory_order_acquire)) { - return entry.buffer; + if (e.ref_count.compare_exchange_weak(current_count, current_count + 1, + std::memory_order_acq_rel, + std::memory_order_acquire)) { + return e.buffer; } } } void VectorPageTable::release_block(block_id_t block_id) { - assert(block_id < entry_num_); - Entry &entry = entries_[block_id]; + assert(block_id < entry_num_.load(std::memory_order_relaxed)); + Entry &e = entry_at(block_id); - if (entry.ref_count.fetch_sub(1, std::memory_order_release) == 1) { + if (e.ref_count.fetch_sub(1, std::memory_order_release) == 1) { std::atomic_thread_fence(std::memory_order_acquire); - // Attempt to transition in_evict_queue from false -> true. The CAS ensures - // only one thread enqueues this block even if multiple threads race here. 
bool expected = false; - if (entry.in_evict_queue.compare_exchange_strong( - expected, true, std::memory_order_acq_rel, - std::memory_order_relaxed)) { + if (e.in_evict_queue.compare_exchange_strong(expected, true, + std::memory_order_acq_rel, + std::memory_order_relaxed)) { BlockEvictionQueue::BlockType block; block.page_table = this; block.vector_block.first = block_id; block.vector_block.second = 0; BlockEvictionQueue::get_instance().add_single_block(block, 0); } - // else: block is already in the eviction queue; do not add a duplicate - // entry. } } void VectorPageTable::evict_block(block_id_t block_id) { - assert(block_id < entry_num_); - Entry &entry = entries_[block_id]; - char *buffer = entry.buffer; + assert(block_id < entry_num_.load(std::memory_order_relaxed)); + Entry &e = entry_at(block_id); int expected = 0; - if (entry.ref_count.compare_exchange_strong( - expected, std::numeric_limits::min())) { + // Two-phase eviction to prevent data race on e.buffer with + // set_block_acquired. We first CAS to kEvicting (-1), which causes + // set_block_acquired to spin-wait; then do the actual work (flush, free, + // null buffer); finally store INT_MIN ("evicted") which unblocks + // set_block_acquired. + static constexpr int kEvicting = -1; + if (e.ref_count.compare_exchange_strong(expected, kEvicting)) { + char *buffer = e.buffer; + if (buffer && e.is_dirty.load(std::memory_order_relaxed) && + flush_callback_) { + flush_callback_(block_id, buffer, kVectorPageSize, e.file_offset); + e.is_dirty.store(false, std::memory_order_relaxed); + } if (buffer) { + e.buffer = nullptr; MemoryLimitPool::get_instance().release_buffer(buffer, kVectorPageSize); } + // Transition to fully-evicted state. Use release so that the + // set_block_acquired acquire-load sees e.buffer == nullptr. 
+ e.ref_count.store(std::numeric_limits::min(), + std::memory_order_release); } - // Always reset in_evict_queue regardless of whether the CAS succeeded: - // - On success: the block is evicted; future releases should re-register it. - // - On failure: the block was re-acquired by another thread between the - // ref-count check and this call. Clearing in_evict_queue lets the next - // release_block() re-enqueue it so it is not silently lost. - entry.in_evict_queue.store(false, std::memory_order_relaxed); + e.in_evict_queue.store(false, std::memory_order_relaxed); } -char *VectorPageTable::set_block_acquired(block_id_t block_id, char *buffer) { - assert(block_id < entry_num_); - Entry &entry = entries_[block_id]; +char *VectorPageTable::set_block_acquired(block_id_t block_id, char *buffer, + size_t file_offset) { + assert(block_id < entry_num_.load(std::memory_order_acquire)); + Entry &e = entry_at(block_id); + // Diagnostics for the kEvicting wait. The wait itself never gives up: + // the only thread that can transition kEvicting -> INT_MIN is the + // evict_block() owner, so abandoning the spin here would orphan the + // entry in kEvicting forever. Instead, we use bounded backoff and emit + // tiered logs so a stuck eviction is observable. 
+ using clock = std::chrono::steady_clock; + const auto wait_start = clock::now(); + auto last_log = wait_start; + unsigned spin_count = 0; + bool warned = false; while (true) { - int current_count = entry.ref_count.load(std::memory_order_relaxed); + int current_count = e.ref_count.load(std::memory_order_acquire); if (current_count >= 0) { - if (entry.ref_count.compare_exchange_weak( - current_count, current_count + 1, std::memory_order_acq_rel, - std::memory_order_acquire)) { + if (e.ref_count.compare_exchange_weak(current_count, current_count + 1, + std::memory_order_acq_rel, + std::memory_order_acquire)) { MemoryLimitPool::get_instance().release_buffer(buffer, kVectorPageSize); - return entry.buffer; + return e.buffer; } + } else if (current_count == std::numeric_limits::min()) { + // Fully evicted — safe to claim this entry for our new buffer. + e.buffer = buffer; + e.file_offset = file_offset; + e.in_evict_queue.store(false, std::memory_order_relaxed); + e.is_dirty.store(false, std::memory_order_relaxed); + e.ref_count.store(1, std::memory_order_release); + return e.buffer; } else { - entry.buffer = buffer; - entry.in_evict_queue.store(false, std::memory_order_relaxed); - entry.ref_count.store(1, std::memory_order_release); - return entry.buffer; + // kEvicting (-1): eviction is in progress on this entry. + // Tiered backoff: hot spin first, then short sleep, then longer sleep. + ++spin_count; + if (spin_count < 64) { + // Pure busy wait for the common ~μs case. + } else if (spin_count < 1024) { + std::this_thread::yield(); + } else if (spin_count < 8192) { + std::this_thread::sleep_for(std::chrono::microseconds(100)); + } else { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + } + // Tiered diagnostics: warn once after 100ms, error every 1s after 1s. 
+ const auto now = clock::now(); + const auto elapsed = now - wait_start; + if (!warned && elapsed >= std::chrono::milliseconds(100)) { + LOG_WARN( + "set_block_acquired: long kEvicting wait on block_id=%zu " + "(>=100ms); evict_block may be slow", + static_cast(block_id)); + warned = true; + } + if (elapsed >= std::chrono::seconds(1) && + (now - last_log) >= std::chrono::seconds(1)) { + const auto secs = + std::chrono::duration_cast(elapsed).count(); + LOG_ERROR( + "set_block_acquired: stuck in kEvicting on block_id=%zu for " + "%lld s; evict_block owner may be hung or starved", + static_cast(block_id), static_cast(secs)); + last_log = now; + } } } } -VecBufferPool::VecBufferPool(const std::string &filename) { +VecBufferPool::VecBufferPool(const std::string &filename, bool writable) { file_name_ = filename; + writable_ = writable; #if defined(_MSC_VER) - fd_ = _open(filename.c_str(), O_RDONLY | _O_BINARY); + int flags = writable_ ? (O_RDWR | _O_BINARY) : (O_RDONLY | _O_BINARY); + fd_ = _open(filename.c_str(), flags, 0644); #else - fd_ = open(filename.c_str(), O_RDONLY); + int flags = writable_ ? 
O_RDWR : O_RDONLY; + fd_ = ::open(filename.c_str(), flags, 0644); #endif if (fd_ < 0) { throw std::runtime_error("Failed to open file: " + filename); @@ -164,11 +301,40 @@ VecBufferPool::VecBufferPool(const std::string &filename) { int VecBufferPool::init() { size_t block_num = (file_size_ + kVectorPageSize - 1) / kVectorPageSize; - page_table_.init(block_num); + if (!page_table_.init(block_num)) { + LOG_ERROR( + "VecBufferPool::init: page_table_ init failed for file[%s], " + "file_size=%zu, block_num=%zu (exceeds " + "VectorPageTable::kMaxEntries=%zu)", + file_name_.c_str(), file_size_, block_num, + VectorPageTable::kMaxEntries); + return -1; + } block_mutexes_ = std::make_unique(VecBufferPool::kMutexBucketCount); LOG_DEBUG("entry num: %zu, file_size: %zu", page_table_.entry_num(), file_size_); + + // In writable mode, inject a flush callback into the page table so that + // evict_block()/flush_block()/flush_all() can pwrite dirty blocks back to + // the backing file without needing to know about fd_ directly. 
+ if (writable_) { + int fd = fd_; + const std::string &name = file_name_; + page_table_.set_flush_callback([fd, &name](block_id_t /*block_id*/, + char *buf, size_t sz, + size_t off) -> int { + ssize_t w = zvec_pwrite(fd, buf, sz, off); + if (w != static_cast(sz)) { + LOG_ERROR( + "Buffer pool flush failed: file[%s], offset[%zu], " + "expected[%zu], got[%zd]", + name.c_str(), off, sz, w); + return -1; + } + return 0; + }); + } return 0; } @@ -213,11 +379,7 @@ char *VecBufferPool::acquire_buffer(block_id_t page_id, int retry) { if (expected_bytes < kVectorPageSize) { std::memset(buffer + expected_bytes, 0, kVectorPageSize - expected_bytes); } -#if defined(_MSC_VER) ssize_t read_bytes = zvec_pread(fd_, buffer, expected_bytes, page_offset); -#else - ssize_t read_bytes = pread(fd_, buffer, expected_bytes, page_offset); -#endif if (read_bytes != static_cast(expected_bytes)) { LOG_ERROR( "Buffer pool failed to read file at offset: file[%s], page_id[%zu], " @@ -226,15 +388,11 @@ char *VecBufferPool::acquire_buffer(block_id_t page_id, int retry) { MemoryLimitPool::get_instance().release_buffer(buffer, kVectorPageSize); return nullptr; } - return page_table_.set_block_acquired(page_id, buffer); + return page_table_.set_block_acquired(page_id, buffer, page_offset); } int VecBufferPool::get_meta(size_t offset, size_t length, char *buffer) { -#if defined(_MSC_VER) ssize_t read_bytes = zvec_pread(fd_, buffer, length, offset); -#else - ssize_t read_bytes = pread(fd_, buffer, length, offset); -#endif if (read_bytes != static_cast(length)) { LOG_ERROR( "Buffer pool failed to read file at offset: file[%s], offset[%zu], " @@ -245,6 +403,141 @@ int VecBufferPool::get_meta(size_t offset, size_t length, char *buffer) { return 0; } +int VecBufferPool::write_range(size_t file_offset, size_t length, + const char *src) { + if (!writable_) { + LOG_ERROR("write_range called on read-only pool: file[%s]", + file_name_.c_str()); + return -1; + } + if (length == 0) { + return 0; + } + size_t 
first_page = file_offset / kVectorPageSize; + size_t last_page = (file_offset + length - 1) / kVectorPageSize; + size_t remaining = length; + size_t src_cursor = 0; + for (size_t pg = first_page; pg <= last_page; ++pg) { + // Loading the page ensures we do not clobber unrelated bytes within the + // same page when the write is not page-aligned. acquire_buffer() pre-fills + // from the backing file (or zero-pads beyond EOF). + char *page = this->acquire_buffer(pg, 50); + if (!page) { + LOG_ERROR("write_range acquire failed: file[%s], page[%zu]", + file_name_.c_str(), pg); + return -1; + } + size_t page_start = pg * kVectorPageSize; + size_t intra_offset = (pg == first_page) ? (file_offset - page_start) : 0; + size_t chunk = std::min(kVectorPageSize - intra_offset, remaining); + std::memcpy(page + intra_offset, src + src_cursor, chunk); + page_table_.mark_dirty(pg); + page_table_.release_block(pg); + src_cursor += chunk; + remaining -= chunk; + } + return 0; +} + +int VecBufferPool::write_meta(size_t offset, size_t length, + const char *buffer) { + if (!writable_) { + LOG_ERROR("write_meta called on read-only pool: file[%s]", + file_name_.c_str()); + return -1; + } + ssize_t w = zvec_pwrite(fd_, buffer, length, offset); + if (w != static_cast(length)) { + LOG_ERROR( + "Buffer pool failed to write meta: file[%s], offset[%zu], " + "length[%zu], got[%zd]", + file_name_.c_str(), offset, length, w); + return -1; + } + return 0; +} + +int VecBufferPool::flush_all() { + if (!writable_) { + return 0; + } + int rc = 0; + size_t total_dirty = 0; + size_t fail_count = 0; + for (size_t i = 0; i < page_table_.entry_num(); ++i) { + if (page_table_.is_block_dirty(i)) { + ++total_dirty; + int r = page_table_.flush_block(i); + if (r != 0) { + rc = r; + ++fail_count; + } + } + } + if (fail_count != 0) { + // Aggregated diagnostic so that callers (notably ~VecBufferPool, which + // discards the return value) cannot silently lose dirty pages: any + // unflushed page at this point means 
the on-disk image is now stale. + LOG_ERROR( + "VecBufferPool::flush_all: %zu/%zu dirty page(s) failed to flush, " + "file[%s] last_rc=%d -- on-disk data may be stale.", + fail_count, total_dirty, file_name_.c_str(), rc); + } + return rc; +} + +bool VecBufferPool::extend_file(size_t new_size) { + if (!writable_) { + LOG_ERROR("extend_file called on read-only pool: file[%s]", + file_name_.c_str()); + return false; + } + if (new_size <= file_size_) { + return true; + } + // Pre-validate against the page table's static capacity BEFORE mutating + // any on-disk state. Otherwise a successful ftruncate followed by a + // failed page_table_.extend() would leave the file size and the page + // table out of sync (file grew, but no Entry slots cover the new range). + size_t new_entry_num = (new_size + kVectorPageSize - 1) / kVectorPageSize; + if (new_entry_num > VectorPageTable::kMaxEntries) { + LOG_ERROR( + "extend_file: requested new_size=%zu would require %zu page entries, " + "exceeding VectorPageTable::kMaxEntries=%zu (file=%s).", + new_size, new_entry_num, VectorPageTable::kMaxEntries, + file_name_.c_str()); + return false; + } +#if defined(_MSC_VER) + if (_chsize_s(fd_, static_cast(new_size)) != 0) { + LOG_ERROR("extend_file _chsize_s failed: file[%s], new_size[%zu]", + file_name_.c_str(), new_size); + return false; + } +#else + if (::ftruncate(fd_, static_cast(new_size)) != 0) { + LOG_ERROR("extend_file ftruncate failed: file[%s], new_size[%zu]", + file_name_.c_str(), new_size); + return false; + } +#endif + file_size_ = new_size; + // Extend the page table to cover the new file range. Existing entries + // stay at their original addresses so concurrent readers are unaffected. + // Capacity has already been validated above, so this should never fail; + // a failure here would indicate a programming error and is logged. 
+ if (new_entry_num > page_table_.entry_num()) { + if (!page_table_.extend(new_entry_num)) { + LOG_ERROR( + "extend_file: page_table_.extend(%zu) failed unexpectedly after " + "capacity pre-check (file=%s, new_size=%zu).", + new_entry_num, file_name_.c_str(), new_size); + return false; + } + } + return true; +} + char *VecBufferPoolHandle::get_single_page(size_t file_offset, size_t len, size_t &out_page_id) { size_t first_page = file_offset / kVectorPageSize; @@ -252,6 +545,10 @@ char *VecBufferPoolHandle::get_single_page(size_t file_offset, size_t len, out_page_id = first_page; char *page = pool_.acquire_buffer(first_page, 50); if (!page) { + LOG_ERROR( + "VecBufferPoolHandle::get_single_page: acquire_buffer failed, " + "file_offset=%zu, len=%zu, page=%zu, page_size=%zu", + file_offset, len, first_page, kVectorPageSize); return nullptr; } return page + (file_offset - first_page * kVectorPageSize); @@ -269,6 +566,11 @@ bool VecBufferPoolHandle::read_range(size_t file_offset, size_t len, for (size_t pg = first_page; pg <= last_page; ++pg) { char *page = pool_.acquire_buffer(pg, 50); if (!page) { + LOG_ERROR( + "VecBufferPoolHandle::read_range: acquire_buffer failed, " + "file_offset=%zu, len=%zu, page=%zu, first_page=%zu, last_page=%zu, " + "page_size=%zu", + file_offset, len, pg, first_page, last_page, kVectorPageSize); return false; } size_t page_start = pg * kVectorPageSize; @@ -286,6 +588,24 @@ int VecBufferPoolHandle::get_meta(size_t offset, size_t length, char *buffer) { return pool_.get_meta(offset, length, buffer); } +int VecBufferPoolHandle::write_range(size_t file_offset, size_t len, + const char *src) { + return pool_.write_range(file_offset, len, src); +} + +int VecBufferPoolHandle::write_meta(size_t offset, size_t length, + const char *buffer) { + return pool_.write_meta(offset, length, buffer); +} + +int VecBufferPoolHandle::flush_all() { + return pool_.flush_all(); +} + +bool VecBufferPoolHandle::writable() const { + return pool_.writable(); +} + void 
VecBufferPoolHandle::release_one(block_id_t block_id) { pool_.page_table_.release_block(block_id); } diff --git a/src/core/algorithm/flat/flat_streamer.cc b/src/core/algorithm/flat/flat_streamer.cc index 8969efc14..5e6171659 100644 --- a/src/core/algorithm/flat/flat_streamer.cc +++ b/src/core/algorithm/flat/flat_streamer.cc @@ -34,7 +34,7 @@ FlatStreamer::FlatStreamer() : entity_(stats_) {} template FlatStreamer::~FlatStreamer() { - if (state_ == STATE_INITED) { + if (state_ == STATE_INITED || state_ == STATE_OPENED) { this->cleanup(); } } diff --git a/src/core/algorithm/flat/flat_streamer_entity.cc b/src/core/algorithm/flat/flat_streamer_entity.cc index 988f5fdfb..87d9a1906 100644 --- a/src/core/algorithm/flat/flat_streamer_entity.cc +++ b/src/core/algorithm/flat/flat_streamer_entity.cc @@ -165,13 +165,20 @@ int FlatStreamerEntity::add(uint64_t key, const void *vec, size_t size) { IndexStorage::MemoryBlock head_block; this->get_head_block(head_block); - const BlockLocation *bl = - reinterpret_cast(head_block.data()); - if (ailego_unlikely(bl == nullptr)) { - LOG_ERROR("Failed to get block loc"); - return IndexError_ReadData; + BlockLocation block; + { + const BlockLocation *bl = + reinterpret_cast(head_block.data()); + if (ailego_unlikely(bl == nullptr)) { + LOG_ERROR("Failed to get block loc"); + return IndexError_ReadData; + } + block = *bl; } - BlockLocation block = *bl; + // Release the head block reference early so that the buffer pool ref_count + // and memory budget held by it do not block subsequent acquire/evict in this + // function (alloc_block / add_to_block may compete for the same memory). 
+ head_block.reset(nullptr); if (!this->is_valid_block(block)) { int ret = this->alloc_block(block, &block); @@ -922,6 +929,9 @@ int FlatStreamerEntity::add_vector_with_id(const uint32_t id, const void *query, this->get_head_block(head_block); BlockLocation block = *reinterpret_cast(head_block.data()); + // Release buffer-pool pin before any alloc_block() call that may trigger + // append_segment() and rebuild the pool (same reason as in add()). + head_block.reset(nullptr); if (!this->is_valid_block(block)) { int ret = this->alloc_block(block, &block); if (ailego_unlikely(ret != 0)) { diff --git a/src/core/algorithm/hnsw/hnsw_index_hash.h b/src/core/algorithm/hnsw/hnsw_index_hash.h index 1557dcd93..cc59e84ab 100644 --- a/src/core/algorithm/hnsw/hnsw_index_hash.h +++ b/src/core/algorithm/hnsw/hnsw_index_hash.h @@ -41,9 +41,9 @@ class HnswIndexHashMap { items_(reinterpret_cast(data)) {} //! Return a empty loc or the key item loc - Slot(Chunk::Pointer &&chunk, IndexStorage::MemoryBlock &&mem_block) - : chunk_(std::move(chunk)), items_block_(std::move(mem_block)) { - items_ = reinterpret_cast(items_block_.data()); + Slot(Chunk::Pointer &&chunk, std::vector &&local_data) + : chunk_(std::move(chunk)), local_data_(std::move(local_data)) { + items_ = reinterpret_cast(local_data_.data()); } const_iterator find(key_type key, uint32_t max_items, uint32_t mask) const { auto it = &items_[key & mask]; @@ -73,8 +73,8 @@ class HnswIndexHashMap { private: Chunk::Pointer chunk_{}; - const Item *items_{nullptr}; // point to chunk data - IndexStorage::MemoryBlock items_block_{}; + const Item *items_{nullptr}; // point to local_data_ + std::vector local_data_{}; }; public: @@ -114,9 +114,9 @@ class HnswIndexHashMap { } int cleanup(void) { - broker_.reset(); slots_.clear(); slots_.shrink_to_fit(); + broker_.reset(); mask_bits_ = 0U; slot_items_ = 0U; slot_loc_mask_ = 0U; @@ -141,7 +141,6 @@ class HnswIndexHashMap { auto idx = key >> mask_bits_; if (idx >= slots_.size()) { if 
(ailego_unlikely(idx >= slots_.capacity())) { - LOG_ERROR("no space to insert"); return false; } for (auto i = slots_.size(); i <= idx; ++i) { @@ -152,7 +151,6 @@ class HnswIndexHashMap { } auto it = slots_[idx].find(key, slot_items_, slot_loc_mask_); if (ailego_unlikely(it == nullptr)) { - LOG_ERROR("no space to insert"); return false; } @@ -179,14 +177,10 @@ class HnswIndexHashMap { LOG_ERROR("Chunk resize failed, size=%zu", size); return false; } - //! Read the whole data to memory - IndexStorage::MemoryBlock data_block; - if (ailego_unlikely(chunk->read(0U, data_block, size) != size)) { - LOG_ERROR("Chunk read failed, size=%zu", size); - return false; - } - - slots_.emplace_back(std::move(chunk), std::move(data_block)); + //! Use a local zero-initialized buffer; new chunks contain all zeros, + //! so no buffer-pool read is needed and no ref_count is pinned. + std::vector local_buf(size, 0); + slots_.emplace_back(std::move(chunk), std::move(local_buf)); return true; } @@ -208,13 +202,14 @@ class HnswIndexHashMap { i, chunk->data_size(), size); return IndexError_InvalidFormat; } - //! Read the whole data to memory - IndexStorage::MemoryBlock data_block; - if (ailego_unlikely(chunk->read(0U, data_block, size) != size)) { - LOG_ERROR("Chunk read failed, size=%zu", size); - return false; + //! Copy chunk data into a local buffer via fetch() so that no + //! buffer-pool block is pinned for the lifetime of the Slot. 
+ std::vector local_buf(size); + if (ailego_unlikely(chunk->fetch(0U, local_buf.data(), size) != size)) { + LOG_ERROR("Chunk fetch failed, size=%zu", size); + return IndexError_InvalidFormat; } - slots_.emplace_back(std::move(chunk), std::move(data_block)); + slots_.emplace_back(std::move(chunk), std::move(local_buf)); } return 0; } diff --git a/src/core/algorithm/hnsw/hnsw_streamer.cc b/src/core/algorithm/hnsw/hnsw_streamer.cc index 935cae5d4..c5e78f415 100644 --- a/src/core/algorithm/hnsw/hnsw_streamer.cc +++ b/src/core/algorithm/hnsw/hnsw_streamer.cc @@ -28,7 +28,7 @@ namespace core { HnswStreamer::HnswStreamer() = default; HnswStreamer::~HnswStreamer() { - if (state_ == STATE_INITED) { + if (state_ == STATE_INITED || state_ == STATE_OPENED) { this->cleanup(); } } diff --git a/src/core/algorithm/hnsw/hnsw_streamer_entity.cc b/src/core/algorithm/hnsw/hnsw_streamer_entity.cc index acc9bee36..a8ada19e6 100644 --- a/src/core/algorithm/hnsw/hnsw_streamer_entity.cc +++ b/src/core/algorithm/hnsw/hnsw_streamer_entity.cc @@ -37,6 +37,7 @@ int HnswStreamerEntity::init(size_t max_doc_cnt) { std::lock_guard lock(mutex_); broker_ = std::make_shared(stats_); upper_neighbor_index_ = std::make_shared(); + upper_neighbor_rw_mutex_ = std::make_shared(); keys_map_lock_ = std::make_shared(); keys_map_ = std::make_shared>(); if (!keys_map_ || !upper_neighbor_index_ || !broker_ || !keys_map_lock_) { @@ -767,9 +768,10 @@ const HnswEntity::Pointer HnswStreamerEntity::clone() const { HnswStreamerEntity *entity = new (std::nothrow) HnswStreamerEntity( stats_, header(), chunk_size_, node_index_mask_bits_, upper_neighbor_mask_bits_, filter_same_key_, get_vector_enabled_, - upper_neighbor_index_, keys_map_lock_, keys_map_, use_key_info_map_, - std::move(node_chunks), std::move(upper_neighbor_chunks), broker_, - node_chunk_bases_, upper_neighbor_chunk_bases_); + upper_neighbor_index_, upper_neighbor_rw_mutex_, keys_map_lock_, + keys_map_, use_key_info_map_, std::move(node_chunks), + 
std::move(upper_neighbor_chunks), broker_, node_chunk_bases_, + upper_neighbor_chunk_bases_); if (ailego_unlikely(!entity)) { LOG_ERROR("HnswStreamerEntity new failed"); } @@ -800,9 +802,9 @@ const HnswEntity::Pointer HnswMmapStreamerEntity::clone() const { auto *entity = new (std::nothrow) HnswMmapStreamerEntity( stats_, header(), chunk_size_, node_index_mask_bits_, upper_neighbor_mask_bits_, filter_same_key_, get_vector_enabled_, - upper_neighbor_index_, keys_map_lock_, keys_map_, use_key_info_map_, - std::move(node_chunks), std::move(upper_neighbor_chunks), broker_, - nullptr, nullptr); + upper_neighbor_index_, upper_neighbor_rw_mutex_, keys_map_lock_, + keys_map_, use_key_info_map_, std::move(node_chunks), + std::move(upper_neighbor_chunks), broker_, nullptr, nullptr); if (ailego_unlikely(!entity)) { LOG_ERROR("HnswMmapStreamerEntity new failed"); } @@ -833,9 +835,9 @@ const HnswEntity::Pointer HnswContiguousStreamerEntity::clone() const { auto *entity = new (std::nothrow) HnswContiguousStreamerEntity( stats_, header(), chunk_size_, node_index_mask_bits_, upper_neighbor_mask_bits_, filter_same_key_, get_vector_enabled_, - upper_neighbor_index_, keys_map_lock_, keys_map_, use_key_info_map_, - std::move(node_chunks), std::move(upper_neighbor_chunks), broker_, - nullptr, nullptr); + upper_neighbor_index_, upper_neighbor_rw_mutex_, keys_map_lock_, + keys_map_, use_key_info_map_, std::move(node_chunks), + std::move(upper_neighbor_chunks), broker_, nullptr, nullptr); if (ailego_unlikely(!entity)) { LOG_ERROR("HnswContiguousStreamerEntity new failed"); return HnswEntity::Pointer(); diff --git a/src/core/algorithm/hnsw/hnsw_streamer_entity.h b/src/core/algorithm/hnsw/hnsw_streamer_entity.h index 3c2fb0cea..677393de3 100644 --- a/src/core/algorithm/hnsw/hnsw_streamer_entity.h +++ b/src/core/algorithm/hnsw/hnsw_streamer_entity.h @@ -17,6 +17,7 @@ #include #include #include +#include #if defined(__linux__) || defined(__APPLE__) #include #endif @@ -246,19 +247,19 @@ class 
HnswStreamerEntity : public HnswEntity { using NIHashMapPointer = std::shared_ptr; //! Clone construct, used by clone method in subclasses - HnswStreamerEntity(IndexStreamer::Stats &stats, const HNSWHeader &hd, - size_t chunk_size, uint32_t node_index_mask_bits, - uint32_t upper_neighbor_mask_bits, bool filter_same_key, - bool get_vector_enabled, - const NIHashMapPointer &upper_neighbor_index, - std::shared_ptr &keys_map_lock, - const HashMapPointer &keys_map, - bool use_key_info_map, - std::vector &&node_chunks, - std::vector &&upper_neighbor_chunks, - const ChunkBroker::Pointer &broker, - std::shared_ptr> node_bases, - std::shared_ptr> upper_bases) + HnswStreamerEntity( + IndexStreamer::Stats &stats, const HNSWHeader &hd, size_t chunk_size, + uint32_t node_index_mask_bits, uint32_t upper_neighbor_mask_bits, + bool filter_same_key, bool get_vector_enabled, + const NIHashMapPointer &upper_neighbor_index, + const std::shared_ptr &upper_neighbor_rw_mutex, + std::shared_ptr &keys_map_lock, + const HashMapPointer &keys_map, bool use_key_info_map, + std::vector &&node_chunks, + std::vector &&upper_neighbor_chunks, + const ChunkBroker::Pointer &broker, + std::shared_ptr> node_bases, + std::shared_ptr> upper_bases) : stats_(stats), chunk_size_(chunk_size), node_index_mask_bits_(node_index_mask_bits), @@ -269,6 +270,7 @@ class HnswStreamerEntity : public HnswEntity { filter_same_key_(filter_same_key), get_vector_enabled_(get_vector_enabled), use_key_info_map_(use_key_info_map), + upper_neighbor_rw_mutex_(upper_neighbor_rw_mutex), upper_neighbor_index_(upper_neighbor_index), keys_map_lock_(keys_map_lock), keys_map_(keys_map), @@ -323,6 +325,10 @@ class HnswStreamerEntity : public HnswEntity { inline std::pair get_upper_neighbor_chunk_loc( level_t level, node_id_t id) const { + // Shared lock: concurrent readers are fine, but must synchronize with + // add_upper_neighbor's exclusive lock to avoid data-race on + // slots_.size() inside HnswIndexHashMap. 
+ std::shared_lock lk(*upper_neighbor_rw_mutex_); auto it = upper_neighbor_index_->find(id); ailego_assert_abort(it != upper_neighbor_index_->end(), "Get upper neighbor header failed"); @@ -370,6 +376,10 @@ class HnswStreamerEntity : public HnswEntity { if (level == 0) { return 0; } + // Exclusive lock: protects upper_neighbor_chunks_.emplace_back() and + // upper_neighbor_index_->insert() from racing with concurrent find() + // calls in get_upper_neighbor_chunk_loc(). + std::unique_lock lk(*upper_neighbor_rw_mutex_); Chunk::Pointer chunk; uint64_t chunk_offset = UINT64_MAX; size_t neighbors_size = get_total_upper_neighbors_size(level); @@ -408,17 +418,40 @@ class HnswStreamerEntity : public HnswEntity { meta.level = level; meta.index = (chunk_index << upper_neighbor_mask_bits_) | (chunk_offset / upper_neighbor_size_); + size_t zero_start = chunk_offset; chunk_offset += upper_neighbor_size_ * level; - if (ailego_unlikely(!upper_neighbor_index_->insert(id, meta.data))) { - LOG_ERROR("HashMap insert value failed"); - return IndexError_Runtime; - } + // IMPORTANT: order matters here. + // 1) resize so the chunk's data_size covers the new region. + // 2) zero-fill the new region: storage backends like BufferStorage do + // NOT zero on resize -- only metadata is updated, and the underlying + // page may contain stale content from a previously-evicted page. + // Without this step, NeighborsHeader::neighbor_cnt is garbage and + // select_entry_point()/search_neighbors() iterate over garbage + // node_ids, eventually triggering find()'s assertion in + // get_upper_neighbor_chunk_loc(). + // 3) ONLY THEN publish the entry to upper_neighbor_index_, so that any + // concurrent reader that finds this id already sees a properly + // zeroed upper-neighbor slot. 
if (ailego_unlikely(chunk->resize(chunk_offset) != chunk_offset)) { LOG_ERROR("Chunk resize to %zu failed", (size_t)chunk_offset); return IndexError_Runtime; } + // Use std::vector instead of a VLA: VLAs are a GNU extension and may + // produce different codegen / be rejected under clang/MSVC. + std::vector zeros(neighbors_size, 0); + if (ailego_unlikely(chunk->write(zero_start, zeros.data(), + neighbors_size) != neighbors_size)) { + LOG_ERROR("Chunk write zeros failed"); + return IndexError_Runtime; + } + + if (ailego_unlikely(!upper_neighbor_index_->insert(id, meta.data))) { + LOG_ERROR("HashMap insert value failed"); + return IndexError_Runtime; + } + return 0; } @@ -529,6 +562,10 @@ class HnswStreamerEntity : public HnswEntity { protected: IndexStreamer::Stats &stats_; std::mutex mutex_{}; + //! Guards upper_neighbor_index_ and upper_neighbor_chunks_ against + //! concurrent reads (find) and writes (insert/emplace_back). + //! Shared via shared_ptr so all clones synchronize on the SAME mutex. + mutable std::shared_ptr upper_neighbor_rw_mutex_{}; size_t max_index_size_{0UL}; uint32_t chunk_size_{kDefaultChunkSize}; uint32_t upper_neighbor_chunk_size_{kDefaultChunkSize}; diff --git a/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_index_hash.h b/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_index_hash.h index 4f01aabb3..bf3dc1e7c 100644 --- a/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_index_hash.h +++ b/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_index_hash.h @@ -41,9 +41,9 @@ class HnswIndexHashMap { items_(reinterpret_cast(data)) {} //! 
Return a empty loc or the key item loc - Slot(Chunk::Pointer &&chunk, IndexStorage::MemoryBlock &&mem_block) - : chunk_(std::move(chunk)), items_block_(std::move(mem_block)) { - items_ = reinterpret_cast(items_block_.data()); + Slot(Chunk::Pointer &&chunk, std::vector &&local_data) + : chunk_(std::move(chunk)), local_data_(std::move(local_data)) { + items_ = reinterpret_cast(local_data_.data()); } const_iterator find(key_type key, uint32_t max_items, uint32_t mask) const { auto it = &items_[key & mask]; @@ -73,8 +73,8 @@ class HnswIndexHashMap { private: Chunk::Pointer chunk_{}; - const Item *items_{nullptr}; // point to chunk data - IndexStorage::MemoryBlock items_block_{}; + const Item *items_{nullptr}; // point to local_data_ + std::vector local_data_{}; }; public: @@ -179,14 +179,18 @@ class HnswIndexHashMap { LOG_ERROR("Chunk resize failed, size=%zu", size); return false; } - //! Read the whole data to memory - IndexStorage::MemoryBlock data_block; - if (ailego_unlikely(chunk->read(0U, data_block, size) != size)) { - LOG_ERROR("Chunk read failed, size=%zu", size); - return false; - } - - slots_.emplace_back(std::move(chunk), std::move(data_block)); + //! Use a local zero-initialized buffer; new chunks contain all zeros, + //! so no buffer-pool read is needed and no ref_count is pinned. + //! NOTE: Previously this used `chunk->read(0U, data_block, size)` which + //! returns a view into the underlying BufferPool page. That made the + //! Slot's `items_` pointer alias buffer-pool memory shared across + //! threads, which under clang -O3 release exposed a data race on + //! Slot::find()'s probing read of `it->second` (concurrent + //! const_cast writes from insert() were not reliably visible). Using a + //! private zero-initialized vector matches the HNSW (non-RABITQ) + //! implementation and avoids this race. 
+ std::vector local_buf(size, 0); + slots_.emplace_back(std::move(chunk), std::move(local_buf)); return true; } @@ -208,13 +212,14 @@ class HnswIndexHashMap { i, chunk->data_size(), size); return IndexError_InvalidFormat; } - //! Read the whole data to memory - IndexStorage::MemoryBlock data_block; - if (ailego_unlikely(chunk->read(0U, data_block, size) != size)) { - LOG_ERROR("Chunk read failed, size=%zu", size); - return false; + //! Copy chunk data into a local buffer via fetch() so that no + //! buffer-pool block is pinned for the lifetime of the Slot. + std::vector local_buf(size); + if (ailego_unlikely(chunk->fetch(0U, local_buf.data(), size) != size)) { + LOG_ERROR("Chunk fetch failed, size=%zu", size); + return IndexError_InvalidFormat; } - slots_.emplace_back(std::move(chunk), std::move(data_block)); + slots_.emplace_back(std::move(chunk), std::move(local_buf)); } return 0; } diff --git a/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer.cc b/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer.cc index 9eacf0bc6..2ea2f6aa0 100644 --- a/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer.cc +++ b/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer.cc @@ -40,7 +40,7 @@ HnswRabitqStreamer::HnswRabitqStreamer(IndexProvider::Pointer provider, provider_(std::move(provider)) {} HnswRabitqStreamer::~HnswRabitqStreamer() { - if (state_ == STATE_INITED) { + if (state_ == STATE_INITED || state_ == STATE_OPENED) { this->cleanup(); } } diff --git a/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.cc b/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.cc index 35501ed94..cef59c35c 100644 --- a/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.cc +++ b/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.cc @@ -34,6 +34,7 @@ int HnswRabitqStreamerEntity::init(size_t max_doc_cnt) { std::lock_guard lock(mutex_); broker_ = std::make_shared(stats_); upper_neighbor_index_ = std::make_shared(); + upper_neighbor_rw_mutex_ = 
std::make_shared(); keys_map_lock_ = std::make_shared(); keys_map_ = std::make_shared>(); if (!keys_map_ || !upper_neighbor_index_ || !broker_ || !keys_map_lock_) { @@ -697,8 +698,9 @@ const HnswRabitqEntity::Pointer HnswRabitqStreamerEntity::clone() const { new (std::nothrow) HnswRabitqStreamerEntity( stats_, header(), chunk_size_, node_index_mask_bits_, upper_neighbor_mask_bits_, filter_same_key_, get_vector_enabled_, - upper_neighbor_index_, keys_map_lock_, keys_map_, use_key_info_map_, - std::move(node_chunks), std::move(upper_neighbor_chunks), broker_); + upper_neighbor_index_, upper_neighbor_rw_mutex_, keys_map_lock_, + keys_map_, use_key_info_map_, std::move(node_chunks), + std::move(upper_neighbor_chunks), broker_); if (ailego_unlikely(!entity)) { LOG_ERROR("HnswRabitqStreamerEntity new failed"); } diff --git a/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.h b/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.h index ea36143af..7c5b600e7 100644 --- a/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.h +++ b/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include #include @@ -216,17 +217,17 @@ class HnswRabitqStreamerEntity : public HnswRabitqEntity { using NIHashMapPointer = std::shared_ptr; //! 
Private construct, only be called by clone method - HnswRabitqStreamerEntity(IndexStreamer::Stats &stats, const HNSWHeader &hd, - size_t chunk_size, uint32_t node_index_mask_bits, - uint32_t upper_neighbor_mask_bits, - bool filter_same_key, bool get_vector_enabled, - const NIHashMapPointer &upper_neighbor_index, - std::shared_ptr &keys_map_lock, - const HashMapPointer &keys_map, - bool use_key_info_map, - std::vector &&node_chunks, - std::vector &&upper_neighbor_chunks, - const HnswRabitqChunkBroker::Pointer &broker) + HnswRabitqStreamerEntity( + IndexStreamer::Stats &stats, const HNSWHeader &hd, size_t chunk_size, + uint32_t node_index_mask_bits, uint32_t upper_neighbor_mask_bits, + bool filter_same_key, bool get_vector_enabled, + const NIHashMapPointer &upper_neighbor_index, + const std::shared_ptr &upper_neighbor_rw_mutex, + std::shared_ptr &keys_map_lock, + const HashMapPointer &keys_map, bool use_key_info_map, + std::vector &&node_chunks, + std::vector &&upper_neighbor_chunks, + const HnswRabitqChunkBroker::Pointer &broker) : stats_(stats), chunk_size_(chunk_size), node_index_mask_bits_(node_index_mask_bits), @@ -237,6 +238,7 @@ class HnswRabitqStreamerEntity : public HnswRabitqEntity { filter_same_key_(filter_same_key), get_vector_enabled_(get_vector_enabled), use_key_info_map_(use_key_info_map), + upper_neighbor_rw_mutex_(upper_neighbor_rw_mutex), upper_neighbor_index_(upper_neighbor_index), keys_map_lock_(keys_map_lock), keys_map_(keys_map), @@ -286,6 +288,11 @@ class HnswRabitqStreamerEntity : public HnswRabitqEntity { inline std::pair get_upper_neighbor_chunk_loc( level_t level, node_id_t id) const { + // Shared lock: concurrent readers are fine, but must synchronize with + // add_upper_neighbor's exclusive lock to avoid data-race on + // slots_.size() inside HnswIndexHashMap (the emplace_back in alloc_slot + // is not atomic and concurrent find() may see a stale size value). 
+ std::shared_lock lk(*upper_neighbor_rw_mutex_); auto it = upper_neighbor_index_->find(id); ailego_assert_abort(it != upper_neighbor_index_->end(), "Get upper neighbor header failed"); @@ -334,6 +341,10 @@ class HnswRabitqStreamerEntity : public HnswRabitqEntity { if (level == 0) { return 0; } + // Exclusive lock: protects upper_neighbor_chunks_.emplace_back() and + // upper_neighbor_index_->insert() from racing with concurrent find() + // calls in get_upper_neighbor_chunk_loc(). + std::unique_lock lk(*upper_neighbor_rw_mutex_); Chunk::Pointer chunk; uint64_t chunk_offset = -1UL; size_t neighbors_size = get_total_upper_neighbors_size(level); @@ -373,17 +384,40 @@ class HnswRabitqStreamerEntity : public HnswRabitqEntity { meta.level = level; meta.index = (chunk_index << upper_neighbor_mask_bits_) | (chunk_offset / upper_neighbor_size_); + size_t zero_start = chunk_offset; chunk_offset += upper_neighbor_size_ * level; - if (ailego_unlikely(!upper_neighbor_index_->insert(id, meta.data))) { - LOG_ERROR("HashMap insert value failed"); - return IndexError_Runtime; - } + // IMPORTANT: order matters here. + // 1) resize so the chunk's data_size covers the new region. + // 2) zero-fill the new region: storage backends like BufferStorage do + // NOT zero on resize -- only metadata is updated, and the underlying + // page may contain stale content from a previously-evicted page. + // Without this step, NeighborsHeader::neighbor_cnt is garbage and + // select_entry_point()/search_neighbors() iterate over garbage + // node_ids, eventually triggering find()'s assertion in + // get_upper_neighbor_chunk_loc() at line 291. + // 3) ONLY THEN publish the entry to upper_neighbor_index_, so that any + // concurrent reader that finds this id already sees a properly + // zeroed upper-neighbor slot. 
if (ailego_unlikely(chunk->resize(chunk_offset) != chunk_offset)) { LOG_ERROR("Chunk resize to %zu failed", (size_t)chunk_offset); return IndexError_Runtime; } + // Use std::vector instead of a VLA: VLAs are a GNU extension and may + // produce different codegen / be rejected under clang/MSVC. + std::vector zeros(neighbors_size, 0); + if (ailego_unlikely(chunk->write(zero_start, zeros.data(), + neighbors_size) != neighbors_size)) { + LOG_ERROR("Chunk write zeros failed"); + return IndexError_Runtime; + } + + if (ailego_unlikely(!upper_neighbor_index_->insert(id, meta.data))) { + LOG_ERROR("HashMap insert value failed"); + return IndexError_Runtime; + } + return 0; } @@ -503,6 +537,11 @@ class HnswRabitqStreamerEntity : public HnswRabitqEntity { bool get_vector_enabled_{false}; bool use_key_info_map_{true}; + // Shared via shared_ptr so that all cloned entities synchronize against + // the SAME mutex instance. A plain std::shared_mutex member would be + // independent per clone and provide no real protection for the shared + // upper_neighbor_index_ hashmap. 
+ mutable std::shared_ptr upper_neighbor_rw_mutex_{}; NIHashMapPointer upper_neighbor_index_{}; mutable std::shared_ptr keys_map_lock_{}; diff --git a/src/core/algorithm/hnsw_sparse/hnsw_sparse_streamer.cc b/src/core/algorithm/hnsw_sparse/hnsw_sparse_streamer.cc index 3abce8087..20c215257 100644 --- a/src/core/algorithm/hnsw_sparse/hnsw_sparse_streamer.cc +++ b/src/core/algorithm/hnsw_sparse/hnsw_sparse_streamer.cc @@ -27,7 +27,7 @@ namespace core { HnswSparseStreamer::HnswSparseStreamer() : entity_(stats_) {} HnswSparseStreamer::~HnswSparseStreamer() { - if (state_ == STATE_INITED) { + if (state_ == STATE_INITED || state_ == STATE_OPENED) { this->cleanup(); } } diff --git a/src/core/algorithm/vamana/vamana_streamer.cc b/src/core/algorithm/vamana/vamana_streamer.cc index ae935eb81..2738a98ad 100644 --- a/src/core/algorithm/vamana/vamana_streamer.cc +++ b/src/core/algorithm/vamana/vamana_streamer.cc @@ -26,7 +26,7 @@ namespace core { VamanaStreamer::VamanaStreamer() = default; VamanaStreamer::~VamanaStreamer() { - if (state_ == STATE_INITED) { + if (state_ == STATE_INITED || state_ == STATE_OPENED) { this->cleanup(); } } diff --git a/src/core/interface/indexes/ivf_index.cc b/src/core/interface/indexes/ivf_index.cc index 0cfba037c..1b91eebea 100644 --- a/src/core/interface/indexes/ivf_index.cc +++ b/src/core/interface/indexes/ivf_index.cc @@ -84,15 +84,22 @@ int IVFIndex::Open(const std::string &file_path, break; } case StorageOptions::StorageType::kBufferPool: { - storage_ = core::IndexFactory::CreateStorage("BufferStorage"); + // NOTE: IVF index is dumped via FileDumper (plain binary file), which is + // not compatible with BufferStorage's IndexFormat layout (header/footer + // chain). Until IVF gains a BufferStorage-aware dump path, fall back to + // MMapFileReadStorage so the freshly-dumped file can be reopened. 
+ storage_ = core::IndexFactory::CreateStorage("MMapFileReadStorage"); if (storage_ == nullptr) { - LOG_ERROR("Failed to create BufferStorage"); + LOG_ERROR( + "Failed to create MMapFileReadStorage (IVF buffer-pool fallback)"); return core::IndexError_Runtime; } int ret = storage_->init(storage_params); if (ret != 0) { - LOG_ERROR("Failed to init BufferStorage, path: %s, err: %s", - file_path_.c_str(), core::IndexError::What(ret)); + LOG_ERROR( + "Failed to init MMapFileReadStorage (IVF buffer-pool fallback), " + "path: %s, err: %s", + file_path_.c_str(), core::IndexError::What(ret)); return ret; } break; diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc index c934dd5d9..bf2485724 100644 --- a/src/core/utility/buffer_storage.cc +++ b/src/core/utility/buffer_storage.cc @@ -12,9 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include +#include +#include +#include #include +#include +#include #include +#include #include #include #include @@ -24,8 +31,62 @@ namespace zvec { namespace core { +namespace { -/*! MMap File Storage +// Cross-compiler helpers for lock-free 64-bit acquire/release access +// to SegmentMeta::data_size / padding_size. +// +// These fields are POD (uint64_t) inside a serialised struct so we cannot +// change their type to std::atomic<>; std::atomic_ref is C++20 and the +// project targets C++17. GCC/Clang have native __atomic_* builtins that +// emit single ldar/stlr on arm64 and plain mov on x86_64. MSVC lacks +// these builtins, so we fall back to volatile load/store paired with a +// std::atomic_thread_fence, which is correct on all targets MSVC ships +// (x86_64 / arm64 desktop) and equivalent in cost. 
+inline uint64_t bs_load_acquire(const uint64_t *p) { +#if defined(__GNUC__) || defined(__clang__) + return __atomic_load_n(p, __ATOMIC_ACQUIRE); +#else + uint64_t v = *static_cast(p); + std::atomic_thread_fence(std::memory_order_acquire); + return v; +#endif +} + +inline uint64_t bs_load_relaxed(const uint64_t *p) { +#if defined(__GNUC__) || defined(__clang__) + return __atomic_load_n(p, __ATOMIC_RELAXED); +#else + return *static_cast(p); +#endif +} + +inline void bs_store_release(uint64_t *p, uint64_t v) { +#if defined(__GNUC__) || defined(__clang__) + __atomic_store_n(p, v, __ATOMIC_RELEASE); +#else + std::atomic_thread_fence(std::memory_order_release); + *static_cast(p) = v; +#endif +} + +inline void bs_store_relaxed(uint64_t *p, uint64_t v) { +#if defined(__GNUC__) || defined(__clang__) + __atomic_store_n(p, v, __ATOMIC_RELAXED); +#else + *static_cast(p) = v; +#endif +} + +} // namespace + +// The legacy read(const void**) overload guarantees the returned pointer +// stays valid until close_index(). Single-page reads pin the page +// (never released); cross-page reads allocate a temp buffer owned by +// tmp_buffers_ (freed in close_index()). Callers wanting bounded +// lifetime should use the read(MemoryBlock&) overload. + +/*! Buffer Storage */ class BufferStorage : public IndexStorage { public: @@ -37,33 +98,38 @@ class BufferStorage : public IndexStorage { //! Index Storage Pointer typedef std::shared_ptr Pointer; - //! Constructor - WrappedSegment(BufferStorage *owner, IndexMapping::Segment *segment, - uint64_t segment_header_start_offset, - IndexFormat::MetaHeader *segment_header, size_t segment_id) - : segment_(segment), + //! Constructor. See segment_info_ for the pointer-stability contract. 
+ WrappedSegment(BufferStorage *owner, IndexMapping::SegmentInfo *info, + size_t segment_id) + : segment_info_(info), owner_(owner), segment_id_(segment_id), - capacity_(static_cast(segment->meta()->data_size + - segment->meta()->padding_size)), - segment_header_start_offset_(segment_header_start_offset), - segment_header_(segment_header) {} + capacity_(static_cast(info->segment.meta()->data_size + + info->segment.meta()->padding_size)) {} //! Destructor ~WrappedSegment(void) override {} //! Retrieve size of data + //! + //! data_size / padding_size are mutated lock-free by concurrent + //! writers (write/resize) and observed by concurrent readers on the + //! lock-free hot path. Use acquire/release ordering so weakly-ordered + //! ARM (e.g. Android arm64) cannot see stale values that would cause + //! read() to truncate len to 0. size_t data_size(void) const override { - return static_cast(segment_->meta()->data_size); + return static_cast( + bs_load_acquire(&segment_info_->segment.meta()->data_size)); } //! Retrieve crc of data uint32_t data_crc(void) const override { - return segment_->meta()->data_crc; + return segment_info_->segment.meta()->data_crc; } //! Retrieve size of padding size_t padding_size(void) const override { - return static_cast(segment_->meta()->padding_size); + return static_cast( + bs_load_acquire(&segment_info_->segment.meta()->padding_size)); } //! Retrieve capacity of segment @@ -72,36 +138,57 @@ class BufferStorage : public IndexStorage { } //! Fetch data from segment (with own buffer) + //! + //! C1: pool/handle are stable for the lifetime of the index + //! (no retire/rebuild), so no lock is needed on the hot path. 
size_t fetch(size_t offset, void *buf, size_t len) const override { - if (ailego_unlikely(offset + len > segment_->meta()->data_size)) { - auto meta = segment_->meta(); - if (offset > meta->data_size) { - offset = meta->data_size; + if (ailego_unlikely(!owner_->buffer_pool_handle_)) { + LOG_ERROR("WrappedSegment::fetch: handle is null, file[%s], id[%zu]", + owner_->file_name_.c_str(), segment_id_); + return 0; + } + const size_t data_size = + bs_load_acquire(&segment_info_->segment.meta()->data_size); + if (ailego_unlikely(offset > data_size || len > data_size - offset)) { + if (offset > data_size) { + offset = data_size; } - len = meta->data_size - offset; + len = data_size - offset; } - size_t abs_offset = segment_header_start_offset_ + - segment_header_->content_offset + - segment_->meta()->data_index + offset; + size_t abs_offset = segment_info_->segment_header_start_offset + + segment_info_->segment_header->content_offset + + segment_info_->segment.meta()->data_index + offset; if (!owner_->buffer_pool_handle_->read_range(abs_offset, len, static_cast(buf))) { + LOG_ERROR( + "WrappedSegment::fetch: read_range failed, file[%s], id[%zu], " + "abs_offset=%zu, len=%zu", + owner_->file_name_.c_str(), segment_id_, abs_offset, len); return 0; } return len; } //! Read data from segment + //! C1: lock-free hot path (pool/handle never change during operation). 
size_t read(size_t offset, const void **data, size_t len) override { - if (ailego_unlikely(offset + len > segment_->meta()->data_size)) { - auto meta = segment_->meta(); - if (offset > meta->data_size) { - offset = meta->data_size; + if (ailego_unlikely(!owner_->buffer_pool_handle_)) { + LOG_ERROR("WrappedSegment::read: handle is null, file[%s], id[%zu]", + owner_->file_name_.c_str(), segment_id_); + *data = nullptr; + return 0; + } + const size_t data_size = + bs_load_acquire(&segment_info_->segment.meta()->data_size); + if (ailego_unlikely(offset > data_size || len > data_size - offset)) { + if (offset > data_size) { + offset = data_size; } - len = meta->data_size - offset; + len = data_size - offset; } - size_t abs_offset = segment_header_start_offset_ + - segment_header_->content_offset + - segment_->meta()->data_index + offset; + size_t abs_offset = segment_info_->segment_header_start_offset + + segment_info_->segment_header->content_offset + + segment_info_->segment.meta()->data_index + offset; size_t first_page = abs_offset / ailego::kVectorPageSize; size_t last_page = (len == 0) ? first_page @@ -111,35 +198,86 @@ class BufferStorage : public IndexStorage { char *raw = owner_->buffer_pool_handle_->get_single_page(abs_offset, len, page_id); if (!raw) { + LOG_ERROR( + "WrappedSegment::read: single-page acquire failed, file[%s], " + "id[%zu], abs_offset=%zu, len=%zu, page=%zu", + owner_->file_name_.c_str(), segment_id_, abs_offset, len, + first_page); + *data = nullptr; return 0; } *data = raw; + // Pin held until close_index() per the never-released contract + // of this overload. + (void)page_id; return len; } - char *tmp = static_cast(ailego_aligned_malloc(len, 4096)); + // Cross-page path: see file-level banner. C11 aligned_alloc requires + // size to be a multiple of alignment, and alignment must be a power + // of two. 
Use a fixed 4096-byte alignment for the dst buffer: 4K is + // the minimum page granularity across all supported platforms + // (always a divisor of the 16K/64K page sizes used on Apple Silicon + // and some Android arm64 configurations) and is sufficient for the + // downstream SIMD/DMA-friendly access contract. Pinning kAlign to + // 4096 also avoids over-allocating 16KB per cross-page read on + // large-page platforms. + static constexpr size_t kAlign = 4096UL; + size_t alloc_size = (len + (kAlign - 1UL)) & ~(kAlign - 1UL); + // Allocate a 4K-aligned slot from the per-storage arena pool. + // This batches page-aligned allocation: under heap fragmentation + // (notably Android Bionic scudo), one large posix_memalign per + // arena via the secondary (mmap-backed) allocator is far more + // reliable than many independent posix_memalign(4K, 4K) calls. + char *tmp = nullptr; + { + std::lock_guard tmp_latch(owner_->tmp_buffers_mutex_); + tmp = owner_->tmp_arena_alloc_locked(alloc_size); + } if (!tmp) { + LOG_ERROR( + "WrappedSegment::read: cross-page alloc failed, file[%s], " + "id[%zu], abs_offset=%zu, len=%zu, alloc_size=%zu, align=%zu", + owner_->file_name_.c_str(), segment_id_, abs_offset, len, + alloc_size, kAlign); + *data = nullptr; return 0; } if (!owner_->buffer_pool_handle_->read_range(abs_offset, len, tmp)) { - ailego_free(tmp); + LOG_ERROR( + "WrappedSegment::read: cross-page read_range failed, file[%s], " + "id[%zu], abs_offset=%zu, len=%zu, first_page=%zu, last_page=%zu", + owner_->file_name_.c_str(), segment_id_, abs_offset, len, + first_page, last_page); + // The arena slot is intentionally not rolled back: rolling back + // would require holding the arena lock across read_range, while + // the worst-case leak per failed read is one slot (alloc_size). + *data = nullptr; return 0; } - owner_->register_tmp_buffer(tmp); *data = tmp; return len; } + //! C1: lock-free hot path (pool/handle never change during operation). 
size_t read(size_t offset, MemoryBlock &data, size_t len) override { - if (ailego_unlikely(offset + len > segment_->meta()->data_size)) { - auto meta = segment_->meta(); - if (offset > meta->data_size) { - offset = meta->data_size; + if (ailego_unlikely(!owner_->buffer_pool_handle_)) { + LOG_ERROR( + "WrappedSegment::read(MemoryBlock&): handle is null, file[%s], " + "id[%zu]", + owner_->file_name_.c_str(), segment_id_); + return 0; + } + const size_t data_size = + bs_load_acquire(&segment_info_->segment.meta()->data_size); + if (ailego_unlikely(offset > data_size || len > data_size - offset)) { + if (offset > data_size) { + offset = data_size; } - len = meta->data_size - offset; + len = data_size - offset; } - size_t abs_offset = segment_header_start_offset_ + - segment_header_->content_offset + - segment_->meta()->data_index + offset; + size_t abs_offset = segment_info_->segment_header_start_offset + + segment_info_->segment_header->content_offset + + segment_info_->segment.meta()->data_index + offset; size_t first_page = abs_offset / ailego::kVectorPageSize; size_t last_page = (len == 0) ? first_page @@ -150,38 +288,154 @@ class BufferStorage : public IndexStorage { len, page_id); if (!raw) { LOG_ERROR("read error (single-page acquire failed)."); - return -1; + return 0; } data.reset(owner_->buffer_pool_handle_.get(), page_id, raw); return len; } - char *tmp = static_cast(ailego_aligned_malloc(len, 4096)); + // C11 aligned_alloc requires the requested size to be a multiple of + // the alignment, and alignment must be a power of two. See the + // sibling read(const void**) overload above for the rationale of + // pinning kAlign to a fixed 4096 instead of sysconf(_SC_PAGESIZE). 
+ static constexpr size_t kAlign = 4096UL; + size_t alloc_size = (len + (kAlign - 1UL)) & ~(kAlign - 1UL); + char *tmp = + static_cast(ailego_aligned_malloc(alloc_size, kAlign)); if (!tmp) { LOG_ERROR("read error (alloc cross-page temp buffer failed)."); - return -1; + return 0; } if (!owner_->buffer_pool_handle_->read_range(abs_offset, len, tmp)) { ailego_free(tmp); LOG_ERROR("read error (cross-page read_range failed)."); - return -1; + return 0; } - data = MemoryBlock::MakeOwned(tmp); + data = MemoryBlock::MakeOwned(tmp, len); return len; } - //! Write data into the storage with offset - size_t write(size_t /*offset*/, const void * /*data*/, - size_t len) override { + //! Write data into the storage with offset. + //! + //! Locking: shared shard latch pairs with flush_index()'s exclusive + //! all-shards latch -- excludes CRC compute over meta_buf while we + //! mutate (data_size, padding_size). meta_mtx_ additionally + //! serialises concurrent writers on the SAME segment so the pair + //! stays consistent (sum == capacity_). + size_t write(size_t offset, const void *data, size_t len) override { + std::shared_lock latch( + owner_->mapping_shards_[owner_->mapping_shard_id()].mtx); + if (ailego_unlikely(!owner_->buffer_pool_handle_ || + !owner_->buffer_pool_)) { + LOG_ERROR("WrappedSegment::write: pool is null, file[%s], id[%zu]", + owner_->file_name_.c_str(), segment_id_); + return 0; + } + if (ailego_unlikely(owner_->corrupted_.load(std::memory_order_acquire))) { + LOG_ERROR( + "WrappedSegment::write: storage is marked corrupted, refusing " + "write, file[%s], id[%zu]", + owner_->file_name_.c_str(), segment_id_); + return 0; + } + // In read-only mode the write is a silent no-op so that callers that + // unconditionally write (e.g. CRC updates) do not return an error. 
+ if (!owner_->buffer_pool_->writable()) { + return len; + } + if (ailego_unlikely(offset > capacity_ || len > capacity_ - offset)) { + LOG_ERROR( + "write() exceeds segment capacity: offset=%zu len=%zu cap=%zu", + offset, len, capacity_); + return 0; + } + auto meta = segment_info_->segment.meta(); + size_t abs_offset = segment_info_->segment_header_start_offset + + segment_info_->segment_header->content_offset + + meta->data_index + offset; + // Write the bytes BEFORE publishing the new data_size to readers. + // Lock-free readers observe data_size with acquire ordering; the + // release-store below establishes happens-before with the page + // contents written above. Publishing data_size first (the previous + // ordering) allowed a reader on weakly-ordered ARM to see the new + // length but still read stale page contents -- or, in the inverse + // direction, see a stale length and truncate len to 0 + // (root cause of "Read sparse vector failed ... ret=0"). + if (owner_->buffer_pool_handle_->write_range( + abs_offset, len, static_cast(data)) != 0) { + LOG_ERROR("write() page-cache write_range failed at abs_offset=%zu", + abs_offset); + return 0; + } + { + std::lock_guard meta_latch(meta_mtx_); + uint64_t cur = bs_load_relaxed(&meta->data_size); + if (offset + len > cur) { + uint64_t new_size = offset + len; + // padding_size is paired with data_size; publish it first + // (relaxed) so readers that acquire data_size see a + // consistent (data_size + padding_size == capacity_) pair. + bs_store_relaxed(&meta->padding_size, capacity_ - new_size); + bs_store_release(&meta->data_size, new_size); + } + } + // Mark dirty unconditionally even when data_size did not grow: + // fixed-size in-place rewrites (e.g. chunk_meta_segment) must still + // trigger flush_all() before the next append_segment(). + owner_->set_as_dirty(); return len; } - //! Resize size of data - size_t resize(size_t /*size*/) override { - return 0; + //! Resize size of data. 
See write() for the locking contract. + size_t resize(size_t size) override { + std::shared_lock latch( + owner_->mapping_shards_[owner_->mapping_shard_id()].mtx); + if (ailego_unlikely(owner_->corrupted_.load(std::memory_order_acquire))) { + LOG_ERROR( + "WrappedSegment::resize: storage is marked corrupted, refusing " + "resize, file[%s], id[%zu]", + owner_->file_name_.c_str(), segment_id_); + return 0; + } + auto meta = segment_info_->segment.meta(); + bool changed = false; + { + std::lock_guard meta_latch(meta_mtx_); + uint64_t cur = bs_load_relaxed(&meta->data_size); + if (cur != size) { + if (size > capacity_) { + size = capacity_; + } + // See write() for the publish ordering rationale: padding first + // (relaxed), then release-store data_size so concurrent lock-free + // readers observe a consistent pair. + bs_store_relaxed(&meta->padding_size, capacity_ - size); + bs_store_release(&meta->data_size, size); + changed = true; + } + } + if (changed) { + owner_->set_as_dirty(); + } + return size; } - //! Update crc of data - void update_data_crc(uint32_t /*crc*/) override {} + //! Update crc of data. See write() for the locking contract. + void update_data_crc(uint32_t crc) override { + std::shared_lock latch( + owner_->mapping_shards_[owner_->mapping_shard_id()].mtx); + if (ailego_unlikely(owner_->corrupted_.load(std::memory_order_acquire))) { + LOG_ERROR( + "WrappedSegment::update_data_crc: storage is marked corrupted, " + "refusing CRC update, file[%s], id[%zu]", + owner_->file_name_.c_str(), segment_id_); + return; + } + { + std::lock_guard meta_latch(meta_mtx_); + segment_info_->segment.meta()->data_crc = crc; + } + owner_->set_as_dirty(); + } //! Clone the segment IndexStorage::Segment::Pointer clone(void) override { @@ -190,14 +444,18 @@ class BufferStorage : public IndexStorage { protected: friend BufferStorage; - IndexMapping::Segment *segment_{}; + // Pointer into BufferStorage::segments_ (unordered_map mapped value). 
+ // The address is stable across map insertions, so re-parses after + // append_segment() are picked up without recreating WrappedSegment. + IndexMapping::SegmentInfo *segment_info_{nullptr}; + // Serialises hot-path writers on the SAME segment so + // (data_size, padding_size, data_crc) updates do not interleave. + mutable std::mutex meta_mtx_{}; private: BufferStorage *owner_{nullptr}; size_t segment_id_{}; size_t capacity_{}; - uint64_t segment_header_start_offset_; - IndexFormat::MetaHeader *segment_header_; }; //! Destructor @@ -211,7 +469,11 @@ class BufferStorage : public IndexStorage { } //! Initialize storage - int init(const ailego::Params & /*params*/) override { + int init(const ailego::Params ¶ms) override { + uint32_t val = params.get_as_uint32(MMAPFILE_STORAGE_SEGMENT_META_CAPACITY); + if (val != 0) { + segment_meta_capacity_ = val; + } return 0; } @@ -222,62 +484,62 @@ class BufferStorage : public IndexStorage { } //! Open storage - int open(const std::string &path, bool /*create_if_missing*/) override { + int open(const std::string &path, bool create_if_missing) override { file_name_ = path; - buffer_pool_ = std::make_shared(path); + if (!ailego::File::IsExist(path) && create_if_missing) { + size_t last_slash = path.rfind('/'); + if (last_slash != std::string::npos) { + ailego::File::MakePath(path.substr(0, last_slash)); + } + int error_code = this->init_index(path); + if (error_code != 0) { + LOG_ERROR("init_index failed for %s, errno=%d", path.c_str(), + error_code); + return error_code; + } + } + + // Open in writable mode when the caller expects to modify the index + // (create_if_missing=true implies write intent, same as MMapFileStorage). 
+ buffer_pool_ = std::make_shared( + path, /*writable=*/create_if_missing); buffer_pool_handle_ = std::make_shared( buffer_pool_->get_handle()); int ret = ParseToMapping(); if (ret != 0) { + this->close_index(); return ret; } ret = buffer_pool_->init(); if (ret != 0) { + this->close_index(); return ret; } LOG_INFO( - "BufferStorage opened: file=%s, max_segment_size=%zu, " + "BufferStorage opened: file=%s, writable=%d, max_segment_size=%lu, " "segment_count=%zu", - file_name_.c_str(), (size_t)max_segment_size_, segments_.size()); + file_name_.c_str(), static_cast(create_if_missing), + max_segment_size_, segments_.size()); return 0; } - void register_tmp_buffer(char *buf) { - std::lock_guard latch(tmp_buffers_mutex_); - tmp_buffers_.push_back(buf); - } - - char *get_buffer(size_t offset, size_t length, size_t /*block_id*/) { - char *tmp = static_cast(ailego_aligned_malloc(length, 4096)); - if (!tmp) { - return nullptr; - } - if (!buffer_pool_handle_->read_range(offset, length, tmp)) { - ailego_free(tmp); - return nullptr; - } - register_tmp_buffer(tmp); - return tmp; - } - - int get_meta(size_t offset, size_t length, char *out) { - return buffer_pool_handle_->get_meta(offset, length, out); - } - - int ParseHeader(size_t offset) { - std::unique_ptr buffer(new char[sizeof(header_)]); - if (get_meta(offset, sizeof(header_), buffer.get()) != 0) { + // PRECONDITION (also for ParseFooter/ParseSegment/ParseToMapping): + // caller holds either single-threaded open() or AllShardsExclusiveLatch. + // Do NOT add an internal lock here -- std::shared_mutex is not reentrant. 
+ int ParseHeader(size_t offset, IndexFormat::MetaHeader *out) { + constexpr size_t kHeaderSize = sizeof(IndexFormat::MetaHeader); + std::unique_ptr buffer(new char[kHeaderSize]); + if (buffer_pool_handle_->get_meta(offset, kHeaderSize, buffer.get()) != 0) { LOG_ERROR("Get segment header failed."); return IndexError_Runtime; } - uint8_t *header_ptr = reinterpret_cast(buffer.get()); - memcpy(&header_, header_ptr, sizeof(header_)); - if (header_.meta_header_size != sizeof(IndexFormat::MetaHeader)) { + memcpy(out, buffer.get(), kHeaderSize); + if (out->meta_header_size != kHeaderSize) { LOG_ERROR("Header meta size is invalid."); return IndexError_InvalidLength; } - if (ailego::Crc32c::Hash(&header_, sizeof(header_), header_.header_crc) != - header_.header_crc) { + if (ailego::Crc32c::Hash(out, kHeaderSize, out->header_crc) != + out->header_crc) { LOG_ERROR("Header meta checksum is invalid."); return IndexError_InvalidChecksum; } @@ -286,7 +548,8 @@ class BufferStorage : public IndexStorage { int ParseFooter(size_t offset) { std::unique_ptr buffer(new char[sizeof(footer_)]); - if (get_meta(offset, sizeof(footer_), buffer.get()) != 0) { + if (buffer_pool_handle_->get_meta(offset, sizeof(footer_), buffer.get()) != + 0) { LOG_ERROR("Get segment footer failed."); return IndexError_Runtime; } @@ -304,12 +567,12 @@ class BufferStorage : public IndexStorage { return 0; } - int ParseSegment(size_t offset) { - std::lock_guard latch(mapping_mutex_); + int ParseSegment(size_t offset, IndexFormat::MetaHeader *chain_header, + uint32_t *out_segment_ids_offset) { std::unique_ptr segment_buffer = std::make_unique(footer_.segments_meta_size); - if (get_meta(offset, footer_.segments_meta_size, segment_buffer.get()) != - 0) { + if (buffer_pool_handle_->get_meta(offset, footer_.segments_meta_size, + segment_buffer.get()) != 0) { LOG_ERROR("Get segment meta failed."); return IndexError_Runtime; } @@ -324,7 +587,7 @@ class BufferStorage : public IndexStorage { for (IndexFormat::SegmentMeta 
*iter = segment_start, *end = segment_start + footer_.segment_count; iter != end; ++iter) { - if (iter->segment_id_offset > footer_.segments_meta_size) { + if (iter->segment_id_offset >= footer_.segments_meta_size) { return IndexError_InvalidValue; } if (iter->data_index > footer_.content_size) { @@ -337,15 +600,34 @@ class BufferStorage : public IndexStorage { if (iter->segment_id_offset < segment_ids_offset) { segment_ids_offset = iter->segment_id_offset; } - id_hash_.emplace( - std::string(reinterpret_cast(segment_start) + - iter->segment_id_offset), - segments_.size()); - segments_.emplace( - std::string(reinterpret_cast(segment_start) + - iter->segment_id_offset), + // Use id_hash_.size() (not segments_.size()) for the block_id: + // segments_ is intentionally NOT cleared between appends to keep + // existing WrappedSegment pointers valid, so it carries stale entries. + // + // Bound the C-string scan to the segments_meta buffer so a missing + // NUL terminator cannot walk past the buffer end (defence against + // crafted-CRC inputs; CRC already covers benign bit flips). + const char *seg_name_start = + reinterpret_cast(segment_start) + + iter->segment_id_offset; + const size_t seg_name_max = + footer_.segments_meta_size - iter->segment_id_offset; + const size_t seg_name_len = ::strnlen(seg_name_start, seg_name_max); + if (seg_name_len == seg_name_max) { + LOG_ERROR("ParseSegment: segment_id missing NUL terminator, file[%s]", + file_name_.c_str()); + return IndexError_InvalidValue; + } + const std::string seg_name(seg_name_start, seg_name_len); + const size_t seg_id = id_hash_.size(); + id_hash_[seg_name] = seg_id; + // In-place update so existing WrappedSegment pointers see the + // refreshed meta_ptr_ after re-parse. chain_header MUST be the + // per-chain owning copy (not a shared &header_) -- see + // chain_headers_ field comment. 
+ segments_[seg_name] = IndexMapping::SegmentInfo{IndexMapping::Segment{iter}, - current_header_start_offset_, &header_}); + current_header_start_offset_, chain_header}; max_segment_size_ = std::max(max_segment_size_, iter->data_size + iter->padding_size); if (sizeof(IndexFormat::SegmentMeta) * footer_.segment_count > @@ -354,36 +636,53 @@ class BufferStorage : public IndexStorage { } } buffer_pool_buffers_.push_back(std::move(segment_buffer)); + if (out_segment_ids_offset) { + *out_segment_ids_offset = segment_ids_offset; + } return 0; } int ParseToMapping() { while (true) { int ret; - ret = ParseHeader(current_header_start_offset_); + // Per-chain owning MetaHeader; see chain_headers_ field comment. + chain_headers_.emplace_back(std::make_unique()); + IndexFormat::MetaHeader *chain_header = chain_headers_.back().get(); + ret = ParseHeader(current_header_start_offset_, chain_header); if (ret != 0) { LOG_ERROR("Failed to parse header, errno %d, %s", ret, IndexError::What(ret)); return ret; } - switch (header_.version) { + switch (chain_header->version) { case IndexFormat::FORMAT_VERSION: break; default: - LOG_ERROR("Unsupported index version: %u", header_.version); + LOG_ERROR("Unsupported index version: %u", chain_header->version); return IndexError_Unsupported; } // Unpack footer - if (header_.meta_footer_size != sizeof(IndexFormat::MetaFooter)) { + if (chain_header->meta_footer_size != sizeof(IndexFormat::MetaFooter)) { return IndexError_InvalidLength; } - if ((int32_t)header_.meta_footer_offset < 0) { + if ((int32_t)chain_header->meta_footer_offset < 0) { return IndexError_Unsupported; } uint64_t footer_offset = - header_.meta_footer_offset + current_header_start_offset_; + chain_header->meta_footer_offset + current_header_start_offset_; + // Reject uint64 wrap-around and offsets past file_size. 
+      if (footer_offset < current_header_start_offset_ ||
+          footer_offset + sizeof(IndexFormat::MetaFooter) >
+              buffer_pool_->file_size()) {
+        // footer_offset is a uint64_t; "%lu" only matches it where unsigned
+        // long is 64 bits wide, so print every offset as unsigned long long.
+        LOG_ERROR(
+            "ParseToMapping: invalid footer_offset=%llu (header=%llu, "
+            "file_size=%llu), file[%s]",
+            static_cast<unsigned long long>(footer_offset),
+            static_cast<unsigned long long>(current_header_start_offset_),
+            static_cast<unsigned long long>(buffer_pool_->file_size()),
+            file_name_.c_str());
+        return IndexError_InvalidValue;
+      }
       ret = ParseFooter(footer_offset);
       if (ret != 0) {
         LOG_ERROR("Failed to parse footer, errno %d, %s", ret,
@@ -398,17 +697,49 @@ class BufferStorage : public IndexStorage {
       }
       const uint64_t segment_start_offset =
           footer_offset - footer_.segments_meta_size;
-      ret = ParseSegment(segment_start_offset);
+      uint32_t segment_ids_offset = footer_.segments_meta_size;
+      ret =
+          ParseSegment(segment_start_offset, chain_header, &segment_ids_offset);
       if (ret != 0) {
         LOG_ERROR("Failed to parse segment, errno %d, %s", ret,
                   IndexError::What(ret));
         return ret;
       }
+      // Record per-chain metadata offsets so flush_index() can write
+      // updated segment metas and footers back to the backing file.
+      meta_chains_.push_back({current_header_start_offset_, footer_offset,
+                              segment_start_offset, footer_.segments_meta_size,
+                              segment_ids_offset, footer_});
+
       if (footer_.next_meta_header_offset == 0) {
         break;
       }
-      current_header_start_offset_ = footer_.next_meta_header_offset;
+      // Reject self-reference / backward jumps and offsets past file_size:
+      // such a corrupted next_meta_header_offset would otherwise drive the
+      // loop into infinite chain growth -> OOM.
+      const uint64_t next_off = footer_.next_meta_header_offset;
+      // Bound next_off by (file_size - header size) rather than adding the
+      // header size to it: next_off is read straight from the file, and a
+      // value close to UINT64_MAX would wrap the sum and pass the check.
+      const uint64_t pool_file_size = buffer_pool_->file_size();
+      if (next_off <= current_header_start_offset_ ||
+          pool_file_size < sizeof(IndexFormat::MetaHeader) ||
+          next_off > pool_file_size - sizeof(IndexFormat::MetaHeader)) {
+        LOG_ERROR(
+            "ParseToMapping: invalid next_meta_header_offset=%llu "
+            "(current=%llu, file_size=%llu), file[%s]",
+            static_cast<unsigned long long>(next_off),
+            static_cast<unsigned long long>(current_header_start_offset_),
+            static_cast<unsigned long long>(pool_file_size),
+            file_name_.c_str());
+        return IndexError_InvalidValue;
+      }
+      // Bound chain count: 1024 chains @ default 1MB segment_meta_capacity
+      // covers >1GB of metadata, far above realistic load.
+      constexpr size_t kMaxChains = 1024;
+      if (chain_headers_.size() >= kMaxChains) {
+        LOG_ERROR(
+            "ParseToMapping: chain count exceeds limit %zu, file[%s] may "
+            "be corrupted",
+            kMaxChains, file_name_.c_str());
+        return IndexError_InvalidLength;
+      }
+      current_header_start_offset_ = next_off;
     }
     return 0;
   }
@@ -441,13 +772,18 @@ class BufferStorage : public IndexStorage {
   //! Retrieve a segment by id
   IndexStorage::Segment::Pointer get(const std::string &id, int) override {
-    auto segment_info = this->get_segment_info(id);
-    if (!segment_info) {
+    std::shared_lock latch(
+        mapping_shards_[mapping_shard_id()].mtx);
+    auto seg_iter = segments_.find(id);
+    if (seg_iter == segments_.end()) {
+      return WrappedSegment::Pointer{};
+    }
+    auto id_iter = id_hash_.find(id);
+    if (id_iter == id_hash_.end()) {
       return WrappedSegment::Pointer{};
     }
-    return std::make_shared(
-        this, &segment_info->segment, segment_info->segment_header_start_offset,
-        segment_info->segment_header, id_hash_[id]);
+    return std::make_shared(this, &seg_iter->second,
+                            id_iter->second);
   }
   //! Test if it a segment exists
@@ -457,20 +793,24 @@ class BufferStorage : public IndexStorage {
   //! Retrieve magic number of index
   uint32_t magic(void) const override {
-    return header_.magic;
+    if (chain_headers_.empty()) {
+      return 0u;
+    }
+    return chain_headers_.front()->magic;
   }
  protected:
-  //! Initialize index version segment
-  int init_version_segment(void) {
+  //!
Initialize index version segment - int init_version_segment(void) { + //! Initialize index version segment (writes content into an IndexMapping). + //! Only intended to be called from init_index() while `mapping` is still + //! open in create-mode. + int init_version_segment(IndexMapping &mapping) { size_t data_size = std::strlen(IndexVersion::Details()); - int error_code = - this->append_segment(INDEX_VERSION_SEGMENT_NAME, data_size); + int error_code = mapping.append(INDEX_VERSION_SEGMENT_NAME, data_size); if (error_code != 0) { return error_code; } - - auto segment = &get_segment_info(INDEX_VERSION_SEGMENT_NAME)->segment; + IndexMapping::Segment *segment = + mapping.map(INDEX_VERSION_SEGMENT_NAME, false, false); if (!segment) { return IndexError_MMapFile; } @@ -484,45 +824,189 @@ class BufferStorage : public IndexStorage { return 0; } - //! Initialize index file - int init_index(const std::string & /*path*/) { - // Add index version - int error_code = this->init_version_segment(); - if (error_code != 0) { - return error_code; + //! Create the initial on-disk index structure and write the mandatory + //! version segment. Uses IndexMapping (the same engine as MMapFileStorage) + //! so the produced file is fully compatible with both storage backends. 
+ int init_index(const std::string &path) { + IndexMapping mapping; + int ret = mapping.create(path, segment_meta_capacity_); + if (ret != 0) { + LOG_ERROR( + "BufferStorage failed to create index file: path[%s], errno[%d]", + path.c_str(), ret); + return ret; } - - // Refresh mapping - this->refresh_index(0); - return 0; + ret = this->init_version_segment(mapping); + if (ret != 0) { + LOG_ERROR( + "BufferStorage failed to append version segment: path[%s], errno[%d]", + path.c_str(), ret); + mapping.close(); + return ret; + } + mapping.refresh(0); + ret = mapping.flush(); + mapping.close(); + if (ret != 0) { + LOG_ERROR( + "BufferStorage failed to flush new index file: path[%s], errno[%d]", + path.c_str(), ret); + } + return ret; } - //! Set the index file as dirty + //! Mark the index as dirty. HOT PATH: store(true) unconditionally -- + //! a load-then-store guard could let a stale cached `true` skip the + //! store after flush_index() CAS'd dirty=false on another core, losing + //! the writer's modification. void set_as_dirty(void) { - index_dirty_ = true; + index_dirty_.store(true, std::memory_order_relaxed); } //! Refresh meta information (checksum, update time, etc.) - void refresh_index(uint64_t /*chkp*/) {} + void refresh_index(uint64_t chkp) { + // CAS-loop max: callers may invoke refresh() out of order, and the + // persisted check_point must be non-decreasing. Relaxed ordering is + // sufficient because flush_index() takes AllShardsExclusiveLatch which + // establishes the necessary happens-before for the disk write. + if (chkp != 0) { + uint64_t cur = pending_check_point_.load(std::memory_order_relaxed); + while (chkp > cur) { + if (pending_check_point_.compare_exchange_weak( + cur, chkp, std::memory_order_relaxed)) { + break; + } + } + } + // Set dirty unconditionally even if our chkp lost the CAS race: the + // winning larger chkp must still be flushed. + index_dirty_.store(true, std::memory_order_relaxed); + } - //! Flush index storage + //! 
Flush index storage. int flush_index(void) { + if (!index_dirty_.load(std::memory_order_relaxed)) { + return 0; + } + // Exclusive all-shards latch excludes the lock-free hot path while we + // hash meta_buf and pwrite footer; without it segments_meta_crc would + // not match the bytes on disk. + AllShardsExclusiveLatch latch(mapping_shards_); + return flush_index_locked(); + } + + //! PRECONDITION: caller holds AllShardsExclusiveLatch. Used by + //! flush_index() (acquires the latch) and close_index() (must flush + //! and tear down under one continuous latch hold). + int flush_index_locked(void) { + // No-op on never-opened / already-closed storage: close_index() + // unconditionally calls us during teardown. + if (!buffer_pool_ || !buffer_pool_handle_) { + index_dirty_.store(false, std::memory_order_relaxed); + return 0; + } + if (corrupted_.load(std::memory_order_acquire)) { + LOG_ERROR( + "BufferStorage::flush_index skipped: storage is marked corrupted, " + "file[%s]", + file_name_.c_str()); + return IndexError_Runtime; + } + if (!buffer_pool_->writable()) { + // Read-only pool: nothing to flush. + index_dirty_.store(false, std::memory_order_relaxed); + return 0; + } + // Claim dirty atomically AT THE START so any concurrent write() that + // lands during this flush re-sets dirty=true and is picked up by the + // next flush; an unconditional store(false) at the end would silently + // swallow it. + bool expected_dirty = true; + if (!index_dirty_.compare_exchange_strong(expected_dirty, false, + std::memory_order_relaxed)) { + // Another thread already claimed; bail out. + return 0; + } + // Snapshot pending_check_point_ AFTER claiming dirty: any newer chkp + // stored by a concurrent refresh_index() will be preserved by the + // CAS-reset at the end (and refresh_index() will have re-set dirty). 
+ const uint64_t consumed_chkp = + pending_check_point_.load(std::memory_order_relaxed); + // Restore consumed_chkp on failure paths (CAS-loop max, same as + // refresh_index()) so a concurrent larger chkp wins. + auto restore_chkp_on_failure = [this, consumed_chkp]() { + if (consumed_chkp == 0) return; + uint64_t cur = pending_check_point_.load(std::memory_order_relaxed); + while (consumed_chkp > cur) { + if (pending_check_point_.compare_exchange_weak( + cur, consumed_chkp, std::memory_order_relaxed)) { + break; + } + } + }; + // Flush dirty data blocks first. + if (buffer_pool_handle_->flush_all() != 0) { + index_dirty_.store(true, std::memory_order_relaxed); + restore_chkp_on_failure(); + LOG_ERROR("flush_all data blocks failed: file[%s]", file_name_.c_str()); + return IndexError_WriteData; + } + // Per-chain: recompute segments_meta CRC, refresh footer, pwrite both. + for (size_t ci = 0; + ci < meta_chains_.size() && ci < buffer_pool_buffers_.size(); ++ci) { + MetaChain &mchain = meta_chains_[ci]; + const char *seg_buf = buffer_pool_buffers_[ci].get(); + mchain.footer.segments_meta_crc = + ailego::Crc32c::Hash(seg_buf, mchain.segment_meta_size, 0u); + IndexFormat::UpdateMetaFooter(&mchain.footer, consumed_chkp); + if (buffer_pool_handle_->write_meta(mchain.segment_meta_file_offset, + mchain.segment_meta_size, + seg_buf) != 0) { + LOG_ERROR("Failed to write segment meta: file[%s], chain[%zu]", + file_name_.c_str(), ci); + index_dirty_.store(true, std::memory_order_relaxed); + restore_chkp_on_failure(); + return IndexError_WriteData; + } + if (buffer_pool_handle_->write_meta( + mchain.footer_file_offset, sizeof(mchain.footer), + reinterpret_cast(&mchain.footer)) != 0) { + LOG_ERROR("Failed to write footer: file[%s], chain[%zu]", + file_name_.c_str(), ci); + index_dirty_.store(true, std::memory_order_relaxed); + restore_chkp_on_failure(); + return IndexError_WriteData; + } + } + if (!meta_chains_.empty()) { + footer_ = meta_chains_.back().footer; + } + // 
CAS-reset pending: only consume the chkp we observed. A concurrent + // larger chkp survives and will be flushed next round (refresh_index() + // also re-set dirty). + uint64_t expected_chkp = consumed_chkp; + pending_check_point_.compare_exchange_strong(expected_chkp, 0, + std::memory_order_relaxed); return 0; } //! Close index storage void close_index(void) { - std::lock_guard latch(mapping_mutex_); + // Hold ONE continuous all-shards latch across flush + teardown so no + // writer can slip in between (which would dirty meta_buf only to have + // the page table reset under it, dropping the modification). + AllShardsExclusiveLatch latch(mapping_shards_); + flush_index_locked(); file_name_.clear(); id_hash_.clear(); segments_.clear(); - memset(&header_, 0, sizeof(header_)); + chain_headers_.clear(); memset(&footer_, 0, sizeof(footer_)); { std::lock_guard tmp_latch(tmp_buffers_mutex_); - for (char *p : tmp_buffers_) { - if (p) { - ailego_free(p); + for (const ArenaBlock &b : tmp_buffers_) { + if (b.base) { + ailego_free(b.base); } } tmp_buffers_.clear(); @@ -531,39 +1015,473 @@ class BufferStorage : public IndexStorage { buffer_pool_.reset(); max_segment_size_ = 0; buffer_pool_buffers_.clear(); + meta_chains_.clear(); + current_header_start_offset_ = 0; + pending_check_point_.store(0, std::memory_order_relaxed); + index_dirty_.store(false, std::memory_order_relaxed); + corrupted_.store(false, std::memory_order_relaxed); } - //! Append a segment into storage - int append_segment(const std::string & /*id*/, size_t /*size*/) { + //! Append a segment into storage. C1: page table extends in-place; + //! latch held only briefly to protect segments_/id_hash_ insertion. + int append_segment(const std::string &id, size_t size) { + // Persist any pending data_size/padding/CRC mutations from prior + // write()/resize() before we re-hash and rewrite the segment_meta. 
+ this->flush_index(); + + AllShardsExclusiveLatch latch(mapping_shards_); + + if (!buffer_pool_ || !buffer_pool_handle_) { + LOG_ERROR("append_segment: pool not ready, file[%s]", file_name_.c_str()); + return IndexError_Runtime; + } + if (corrupted_.load(std::memory_order_acquire)) { + LOG_ERROR( + "append_segment: storage is marked corrupted, refusing to append, " + "file[%s], id[%s]", + file_name_.c_str(), id.c_str()); + return IndexError_Runtime; + } + if (!buffer_pool_->writable()) { + LOG_ERROR("append_segment: pool is read-only, file[%s]", + file_name_.c_str()); + return IndexError_Runtime; + } + if (size == 0) { + return IndexError_InvalidArgument; + } + if (segments_.find(id) != segments_.end()) { + return IndexError_Duplicate; + } + if (meta_chains_.empty() || chain_headers_.empty() || + buffer_pool_buffers_.empty()) { + LOG_ERROR("append_segment: invalid state, file[%s]", file_name_.c_str()); + return IndexError_Runtime; + } + + // Page-aligned padded size; matches IndexMapping::CalcPageAlignedSize(). + const size_t page_size = ailego::kVectorPageSize; + const size_t padded_size = (size + page_size - 1) / page_size * page_size; + + // The current last chain owns footer_ (overwritten by ParseFooter). + size_t id_size = id.length() + 1; + size_t need_size = sizeof(IndexFormat::SegmentMeta) + id_size; + MetaChain *chain = &meta_chains_.back(); + IndexFormat::MetaHeader *header = chain_headers_.back().get(); + char *meta_buf = buffer_pool_buffers_.back().get(); + + // Rollback handle for an in-memory-committed chain split. Default + // no-op; populated only after Step 1 commits, so a Step 2 failure + // can fully undo the split (otherwise an orphan empty chain would + // remain linked in the file). + std::function rollback_step1 = []() {}; + + // ---- Step 1: chain split if current chain has no meta capacity left. 
+    if (sizeof(IndexFormat::SegmentMeta) * footer_.segment_count + need_size >
+        chain->segment_ids_offset) {
+      size_t new_chain_start = buffer_pool_->file_size();
+      new_chain_start =
+          (new_chain_start + page_size - 1) / page_size * page_size;
+      size_t new_meta_total =
+          (segment_meta_capacity_ + sizeof(IndexFormat::MetaHeader) +
+           sizeof(IndexFormat::MetaFooter) + page_size - 1) /
+          page_size * page_size;
+      uint32_t new_segments_meta_size = static_cast(
+          new_meta_total - sizeof(IndexFormat::MetaHeader) -
+          sizeof(IndexFormat::MetaFooter));
+
+      // A fresh chain offers new_segments_meta_size bytes for its first
+      // SegmentMeta entry plus the NUL-terminated id (need_size). Refuse an
+      // id that cannot fit BEFORE anything is written to disk: Step 2 does
+      // not re-check the capacity, so its segment_ids_offset - id_size
+      // arithmetic could wrap around and the entry / id memcpy would land
+      // on top of each other or outside meta_buf.
+      if (need_size > new_segments_meta_size) {
+        LOG_ERROR(
+            "append_segment: segment id does not fit into a new meta chain "
+            "(need=%zu, capacity=%zu), file[%s]",
+            need_size, static_cast<size_t>(new_segments_meta_size),
+            file_name_.c_str());
+        return IndexError_InvalidArgument;
+      }
+
+      // Stage the linked old footer without mutating footer_ yet.
+      const auto saved_footer_before_split = footer_;
+      IndexFormat::MetaFooter linked_footer = footer_;
+      linked_footer.next_meta_header_offset = new_chain_start;
+      IndexFormat::UpdateMetaFooter(&linked_footer, 0);
+
+      if (buffer_pool_handle_->write_meta(
+              chain->footer_file_offset, sizeof(linked_footer),
+              reinterpret_cast(&linked_footer)) != 0) {
+        LOG_ERROR("append_segment: write old footer failed, file[%s]",
+                  file_name_.c_str());
+        return IndexError_WriteData;
+      }
+
+      // Best-effort restore of the old footer if any subsequent write in
+      // this split block fails. If the restore itself fails, mark the
+      // storage corrupted -- on-disk old footer now points at a partial
+      // new chain region.
+      auto undo_old_footer = [this, chain, &saved_footer_before_split]() {
+        if (buffer_pool_handle_->write_meta(
+                chain->footer_file_offset, sizeof(saved_footer_before_split),
+                reinterpret_cast(&saved_footer_before_split)) !=
+            0) {
+          LOG_ERROR(
+              "append_segment: rollback write of old footer FAILED, file[%s] "
+              "is now in an inconsistent state -- marking storage as "
+              "corrupted; further writes will be rejected.",
+              file_name_.c_str());
+          corrupted_.store(true, std::memory_order_release);
+        }
+      };
+
+      // Extend the file and write the new chain's header + (zero) footer.
+      // The segment_meta region is zero-filled by ftruncate.
+ if (!buffer_pool_->extend_file(new_chain_start + new_meta_total)) { + undo_old_footer(); + return IndexError_Runtime; + } + + auto new_header = std::make_unique(); + IndexFormat::SetupMetaHeader( + new_header.get(), + static_cast(new_meta_total - + sizeof(IndexFormat::MetaFooter)), + static_cast(new_meta_total)); + + auto new_meta_buf = std::make_unique(new_segments_meta_size); + std::memset(new_meta_buf.get(), 0, new_segments_meta_size); + + IndexFormat::MetaFooter new_footer; + IndexFormat::SetupMetaFooter(&new_footer); + new_footer.segments_meta_size = new_segments_meta_size; + new_footer.total_size = new_meta_total; + new_footer.segments_meta_crc = + ailego::Crc32c::Hash(new_meta_buf.get(), new_segments_meta_size, 0u); + IndexFormat::UpdateMetaFooter(&new_footer, 0); + + if (buffer_pool_handle_->write_meta( + new_chain_start, sizeof(IndexFormat::MetaHeader), + reinterpret_cast(new_header.get())) != 0) { + undo_old_footer(); + return IndexError_WriteData; + } + uint64_t new_segment_meta_file_offset = + new_chain_start + sizeof(IndexFormat::MetaHeader); + uint64_t new_footer_file_offset = + new_chain_start + new_header->meta_footer_offset; + if (buffer_pool_handle_->write_meta( + new_footer_file_offset, sizeof(new_footer), + reinterpret_cast(&new_footer)) != 0) { + undo_old_footer(); + return IndexError_WriteData; + } + + // Snapshot the OLD chain's pre-commit state for rollback_step1 + // (captured by value: `chain` is reassigned below). + const auto saved_old_chain_footer = chain->footer; + const uint64_t saved_old_footer_file_offset = chain->footer_file_offset; + const uint64_t saved_current_header_start = current_header_start_offset_; + + // Strong exception guarantee: reserve() FIRST so the three + // push_back's cannot throw mid-way and leave + // chain_headers_/buffer_pool_buffers_/meta_chains_ at mismatched + // sizes (which flush_index_locked() would silently skip while + // ParseToMapping() on next open follows the on-disk forward link). 
+ try { + chain_headers_.reserve(chain_headers_.size() + 1); + buffer_pool_buffers_.reserve(buffer_pool_buffers_.size() + 1); + meta_chains_.reserve(meta_chains_.size() + 1); + } catch (const std::bad_alloc &) { + LOG_ERROR( + "append_segment: reserve for chain-split commit failed, file[%s]", + file_name_.c_str()); + undo_old_footer(); + return IndexError_Runtime; + } + chain = &meta_chains_.back(); + chain->footer = linked_footer; // old chain keeps linked footer + chain_headers_.push_back(std::move(new_header)); + buffer_pool_buffers_.push_back(std::move(new_meta_buf)); + meta_chains_.push_back(MetaChain{ + new_chain_start, new_footer_file_offset, new_segment_meta_file_offset, + new_segments_meta_size, new_segments_meta_size, new_footer}); + footer_ = new_footer; + current_header_start_offset_ = new_chain_start; + + chain = &meta_chains_.back(); + header = chain_headers_.back().get(); + meta_buf = buffer_pool_buffers_.back().get(); + + // Install rollback for the committed split. Captures by value so + // later reassignment of chain/header/meta_buf does not corrupt the + // closure. + rollback_step1 = [this, saved_footer_before_split, saved_old_chain_footer, + saved_old_footer_file_offset, + saved_current_header_start]() { + // 1. Drop the forward link on the old footer. If this fails the + // on-disk old footer still points at the popped new chain + // region -- mark corrupted. + if (buffer_pool_handle_->write_meta( + saved_old_footer_file_offset, sizeof(saved_footer_before_split), + reinterpret_cast(&saved_footer_before_split)) != + 0) { + LOG_ERROR( + "append_segment: rollback_step1 write of old footer FAILED, " + "file[%s] is now in an inconsistent state -- marking storage " + "as corrupted; further writes will be rejected.", + file_name_.c_str()); + corrupted_.store(true, std::memory_order_release); + } + // 2. Pop the freshly-pushed new chain (releases its unique_ptrs). 
+        if (!meta_chains_.empty()) meta_chains_.pop_back();
+        if (!chain_headers_.empty()) chain_headers_.pop_back();
+        if (!buffer_pool_buffers_.empty()) buffer_pool_buffers_.pop_back();
+        // 3. Restore the old chain's in-memory footer (forward link cleared).
+        if (!meta_chains_.empty()) {
+          meta_chains_.back().footer = saved_old_chain_footer;
+        }
+        // 4. Restore footer_ + current_header_start_offset_. The on-disk
+        //    file size is intentionally NOT shrunk: the orphan region is
+        //    unreachable (step 1 cleared the link) and reusable by the
+        //    next split via file_size() realignment.
+        footer_ = saved_footer_before_split;
+        current_header_start_offset_ = saved_current_header_start;
+      };
+    }
+
+    // ---- Step 2: append SegmentMeta + ID into the (possibly new) last
+    // chain, then persist meta_buf and footer.
+    uint64_t new_data_index = footer_.content_size;
+    uint64_t new_seg_abs_offset =
+        chain->header_start_offset + header->content_offset + new_data_index;
+    uint64_t new_file_size = new_seg_abs_offset + padded_size;
+    if (new_file_size > buffer_pool_->file_size()) {
+      if (!buffer_pool_->extend_file(new_file_size)) {
+        // No Step 2 state has been touched yet, so rollback_step2() is not
+        // needed here. Step 1 may however have committed a chain split;
+        // undo it exactly like the other Step 2 failure paths do
+        // (rollback_step1 is a no-op when no split happened).
+        LOG_ERROR(
+            "append_segment: extend_file to %zu bytes failed, file[%s]",
+            static_cast<size_t>(new_file_size), file_name_.c_str());
+        rollback_step1();
+        return IndexError_Runtime;
+      }
+    }
+
+    // Save mutable state for rollback if a Step 2 disk write fails. The
+    // meta_buf regions that get overwritten (SegmentMeta entry + ID
+    // string) are also snapshotted so they can be restored exactly,
+    // keeping CRC consistent for a later flush_index().
+ const auto saved_footer = footer_; + const auto saved_chain_footer = chain->footer; + const auto saved_segment_ids_offset = chain->segment_ids_offset; + const size_t meta_entry_off = + sizeof(IndexFormat::SegmentMeta) * footer_.segment_count; + const uint32_t new_ids_off = + chain->segment_ids_offset - static_cast(id_size); + char saved_meta_entry[sizeof(IndexFormat::SegmentMeta)]; + std::memcpy(saved_meta_entry, meta_buf + meta_entry_off, + sizeof(IndexFormat::SegmentMeta)); + std::unique_ptr saved_id_bytes(new char[id_size]); + std::memcpy(saved_id_bytes.get(), meta_buf + new_ids_off, id_size); + + chain->segment_ids_offset -= static_cast(id_size); + IndexFormat::SegmentMeta *new_seg = + reinterpret_cast(meta_buf) + + footer_.segment_count; + new_seg->segment_id_offset = chain->segment_ids_offset; + new_seg->data_index = new_data_index; + new_seg->data_size = 0; + new_seg->data_crc = 0; + new_seg->padding_size = padded_size; + std::memcpy(meta_buf + chain->segment_ids_offset, id.c_str(), id_size); + + footer_.segment_count += 1; + footer_.content_size += padded_size; + footer_.total_size += padded_size; + footer_.segments_meta_crc = + ailego::Crc32c::Hash(meta_buf, chain->segment_meta_size, 0u); + IndexFormat::UpdateMetaFooter(&footer_, 0); + chain->footer = footer_; // sync in-memory copy for flush_index + + // Rollback for Step 2: restore in-memory state AND best-effort + // rewrite the OLD segments_meta + footer back to disk. Without the + // disk rewrite, a write_meta(footer) failure (or post-write OOM) + // would tell the caller the append failed yet leave on-disk bytes + // describing the failed append -- ParseToMapping() on next open + // would surface a ghost segment with no entry in segments_/id_hash_. + // + // If the rewrite itself fails the file is unrepairable from here: + // raise corrupted_ so subsequent writers refuse to proceed. 
+ auto rollback_step2 = [&]() { + std::memcpy(meta_buf + meta_entry_off, saved_meta_entry, + sizeof(IndexFormat::SegmentMeta)); + std::memcpy(meta_buf + new_ids_off, saved_id_bytes.get(), id_size); + footer_ = saved_footer; + chain->footer = saved_chain_footer; + chain->segment_ids_offset = saved_segment_ids_offset; + + const int rc_meta = buffer_pool_handle_->write_meta( + chain->segment_meta_file_offset, chain->segment_meta_size, meta_buf); + const int rc_footer = buffer_pool_handle_->write_meta( + chain->footer_file_offset, sizeof(footer_), + reinterpret_cast(&footer_)); + if (rc_meta != 0 || rc_footer != 0) { + LOG_ERROR( + "append_segment: rollback_step2 disk rewrite FAILED " + "(rc_meta=%d, rc_footer=%d), file[%s] is now in an " + "inconsistent state -- marking storage as corrupted; further " + "writes will be rejected.", + rc_meta, rc_footer, file_name_.c_str()); + corrupted_.store(true, std::memory_order_release); + } + }; + + if (buffer_pool_handle_->write_meta(chain->segment_meta_file_offset, + chain->segment_meta_size, + meta_buf) != 0) { + LOG_ERROR("append_segment: write segment_meta failed, file[%s]", + file_name_.c_str()); + rollback_step2(); + rollback_step1(); + return IndexError_WriteData; + } + if (buffer_pool_handle_->write_meta( + chain->footer_file_offset, sizeof(footer_), + reinterpret_cast(&footer_)) != 0) { + LOG_ERROR("append_segment: write footer failed, file[%s]", + file_name_.c_str()); + rollback_step2(); + rollback_step1(); + return IndexError_WriteData; + } + + // Strong exception guarantee for the in-memory commit: emplace into + // segments_ and id_hash_ as one transactional unit -- if id_hash_ + // throws after segments_ succeeded, undo segments_ before + // propagating. unordered_map::emplace() leaves existing element + // addresses stable, so WrappedSegment instances pointing into + // segments_ remain valid. 
+ auto seg_ins = segments_.end(); + bool seg_inserted = false; + try { + auto ins = segments_.emplace( + id, IndexMapping::SegmentInfo{IndexMapping::Segment{new_seg}, + chain->header_start_offset, header}); + if (!ins.second) { + // Cannot happen under the exclusive latch we hold (find() above + // checked), but be defensive. + LOG_ERROR( + "append_segment: duplicate id appeared after commit, file[%s], " + "id[%s]", + file_name_.c_str(), id.c_str()); + rollback_step2(); + rollback_step1(); + return IndexError_Duplicate; + } + seg_ins = ins.first; + seg_inserted = true; + const size_t new_id = id_hash_.size(); + id_hash_.emplace(id, new_id); + } catch (const std::bad_alloc &) { + LOG_ERROR( + "append_segment: in-memory commit OOM, rolling back, file[%s], " + "id[%s]", + file_name_.c_str(), id.c_str()); + if (seg_inserted) { + segments_.erase(seg_ins); + } + rollback_step2(); + rollback_step1(); + return IndexError_Runtime; + } + max_segment_size_ = std::max(max_segment_size_, padded_size); + // C1: extend_file() already extended the page table in-place; no pool + // rotation or flush_all needed. return 0; } //! Test if a segment exists bool has_segment(const std::string &id) const { - std::lock_guard latch(mapping_mutex_); + std::shared_lock latch( + mapping_shards_[mapping_shard_id()].mtx); return (segments_.find(id) != segments_.end()); } - //! Get a segment from storage - IndexMapping::SegmentInfo *get_segment_info(const std::string &id) { - std::lock_guard latch(mapping_mutex_); - auto iter = segments_.find(id); - if (iter == segments_.end()) { - return nullptr; - } - return &iter->second; + private: + std::atomic index_dirty_{false}; + std::atomic pending_check_point_{0}; + // Set when an append_segment() rollback fails to restore on-disk state. + // Once set, all writers (write/append_segment/flush_index_locked) refuse + // to proceed. Only ever raised; cleared only by close_index(). 
+ std::atomic corrupted_{false}; + + // Sharded reader-writer lock: each reader hashes to its own shard to + // avoid cache-line ping-pong on the reader counter; writers lock all + // shards. + static constexpr size_t kMappingMutexShards = 32; + struct alignas(64) MutexShard { + std::shared_mutex mtx; + }; + mutable MutexShard mapping_shards_[kMappingMutexShards]{}; + + // Per-(thread, instance) shard selection. Combining thread::id with + // `this` ensures two BufferStorage instances on the same thread map to + // different shards (a thread_local-only id collapses them onto one + // shard). boost-style hash_combine disperses skewed thread::id + // distributions across the 32 shards. + size_t mapping_shard_id() const { + size_t seed = std::hash()(std::this_thread::get_id()); + size_t inst = std::hash()(static_cast(this)); + // boost::hash_combine(seed, inst) + seed ^= inst + 0x9e3779b97f4a7c15ULL + (seed << 6) + (seed >> 2); + return seed % kMappingMutexShards; } - private: - bool index_dirty_{false}; - mutable std::mutex mapping_mutex_{}; + // RAII guard that locks ALL shards exclusively (for writers). + struct AllShardsExclusiveLatch { + MutexShard *shards_; + AllShardsExclusiveLatch(MutexShard *shards) : shards_(shards) { + for (size_t i = 0; i < kMappingMutexShards; ++i) shards_[i].mtx.lock(); + } + ~AllShardsExclusiveLatch() { + for (size_t i = 0; i < kMappingMutexShards; ++i) shards_[i].mtx.unlock(); + } + AllShardsExclusiveLatch(const AllShardsExclusiveLatch &) = delete; + AllShardsExclusiveLatch &operator=(const AllShardsExclusiveLatch &) = + delete; + }; - std::vector tmp_buffers_{}; + // Arena slab for cross-page temp buffers handed out by + // WrappedSegment::read(const void**). The legacy contract requires + // every returned pointer to stay valid until close_index(), so slots + // are never freed individually -- they are carved out of large + // 4K-aligned arenas which are released in bulk. 
+ // + // Why an arena instead of one posix_memalign(4K, 4K) per read: + // Android Bionic scudo's small-class chunk pool is prone to large- + // alignment starvation under fragmentation (we observed sporadic + // posix_memalign(4096, 4096) returning ENOMEM even with plenty of + // free memory). A single large request (>= kArenaSize) is served + // from scudo's secondary allocator (mmap-backed), which is reliable + // up to the true OOM boundary. + struct ArenaBlock { + char *base{nullptr}; + size_t size{0}; // Total bytes in this arena (4K-aligned). + size_t used{0}; // Bytes already handed out (4K-aligned). + }; + // Caller MUST hold tmp_buffers_mutex_. alloc_size MUST be a + // multiple of 4096. Returns nullptr only if scudo cannot satisfy a + // fresh arena allocation, i.e. effectively true OOM. + char *tmp_arena_alloc_locked(size_t alloc_size) { + static constexpr size_t kAlign = 4096UL; + static constexpr size_t kArenaSize = 1UL << 20; // 1 MiB + if (!tmp_buffers_.empty()) { + ArenaBlock &back = tmp_buffers_.back(); + if (back.base && back.size - back.used >= alloc_size) { + char *out = back.base + back.used; + back.used += alloc_size; + return out; + } + } + size_t new_size = alloc_size > kArenaSize ? alloc_size : kArenaSize; + char *p = static_cast(ailego_aligned_malloc(new_size, kAlign)); + if (!p) { + return nullptr; + } + tmp_buffers_.push_back(ArenaBlock{p, new_size, alloc_size}); + return p; + } + std::vector tmp_buffers_{}; mutable std::mutex tmp_buffers_mutex_{}; // buffer manager std::string file_name_; - IndexFormat::MetaHeader header_{}; + // Per-chain owning copies of MetaHeader. segments_[name].segment_header + // points into one of these; using a single shared header_ would let the + // next chain's ParseHeader overwrite earlier-chain content_offset. 
+ std::vector> chain_headers_{}; IndexFormat::MetaFooter footer_{}; std::unordered_map segments_{}; std::unordered_map id_hash_{}; @@ -573,6 +1491,26 @@ class BufferStorage : public IndexStorage { ailego::VecBufferPool::Pointer buffer_pool_{nullptr}; ailego::VecBufferPoolHandle::Pointer buffer_pool_handle_{nullptr}; uint64_t current_header_start_offset_{0u}; + + // Capacity (in bytes) of the segment metadata section written by + // init_index(). + uint32_t segment_meta_capacity_{4096u}; + + // Per-header-chain file offsets used by flush_index() and append_segment(). + struct MetaChain { + uint64_t header_start_offset; + uint64_t footer_file_offset; + uint64_t segment_meta_file_offset; + uint32_t segment_meta_size; + // Lowest segment-ID-string offset within segment_meta; equals + // segment_meta_size when empty, decreases by strlen(id)+1 per append. + // Used to detect when a chain split is needed. + uint32_t segment_ids_offset; + // In-memory copy of this chain's MetaFooter, kept in sync with disk by + // flush_index() and append_segment() to avoid a pread per chain. 
+ IndexFormat::MetaFooter footer; + }; + std::vector meta_chains_{}; }; INDEX_FACTORY_REGISTER_STORAGE(BufferStorage); diff --git a/src/db/index/segment/segment.cc b/src/db/index/segment/segment.cc index 96ec3dc37..6dd765262 100644 --- a/src/db/index/segment/segment.cc +++ b/src/db/index/segment/segment.cc @@ -526,10 +526,20 @@ Status SegmentImpl::close() { } } vector_indexers_.clear(); + for (const auto &[name, indexers] : quant_vector_indexers_) { + for (auto indexer : indexers) { + indexer->Close(); + } + } + quant_vector_indexers_.clear(); for (auto [name, indexer] : memory_vector_indexers_) { indexer->Close(); } memory_vector_indexers_.clear(); + for (auto [name, indexer] : quant_memory_vector_indexers_) { + indexer->Close(); + } + quant_memory_vector_indexers_.clear(); return Status::OK(); } diff --git a/src/db/index/storage/lazy_record_batch_reader.h b/src/db/index/storage/lazy_record_batch_reader.h index 451bba8e0..e1286e305 100644 --- a/src/db/index/storage/lazy_record_batch_reader.h +++ b/src/db/index/storage/lazy_record_batch_reader.h @@ -128,7 +128,8 @@ class ParquetRecordBatchReader : public arrow::RecordBatchReader { std::vector> chunks(col_indices_.size()); if (with_cache_) { for (size_t col_idx = 0; col_idx < col_indices_.size(); ++col_idx) { - auto buffer_id = ailego::ParquetBufferID(file_path_, col_idx, rg_id); + auto buffer_id = + ailego::ParquetBufferID(file_path_, col_indices_[col_idx], rg_id); auto buffer_handle = ailego::ParquetBufferPool::get_instance().acquire_buffer(buffer_id); std::shared_ptr col_chunked_array = diff --git a/src/db/index/storage/store_helper.h b/src/db/index/storage/store_helper.h index f930e42ec..abb4599e5 100644 --- a/src/db/index/storage/store_helper.h +++ b/src/db/index/storage/store_helper.h @@ -267,12 +267,7 @@ inline arrow::Status ConvertScalarVectorToArrayByType( return arrow::Status::Invalid( "Cannot convert empty vector to list array"); } - - auto list_type = std::dynamic_pointer_cast(type); - if (!list_type) { 
- return arrow::Status::TypeError("Expected ListType for LIST scalar"); - } - + auto list_type = std::static_pointer_cast(type); std::unique_ptr value_builder; ARROW_RETURN_NOT_OK(arrow::MakeBuilder(arrow::default_memory_pool(), list_type->value_type(), @@ -287,10 +282,9 @@ inline arrow::Status ConvertScalarVectorToArrayByType( continue; } - auto list_scalar = std::dynamic_pointer_cast(scalar); - if (!list_scalar) { - return arrow::Status::TypeError("Expected ListScalar for LIST type"); - } + // Same rationale: scalar->type->id() == LIST implies the + // scalar IS a ListScalar; avoid RTTI-dependent cast. + auto list_scalar = std::static_pointer_cast(scalar); ARROW_RETURN_NOT_OK(builder.Append()); auto value_builder_ptr = builder.value_builder(); @@ -371,12 +365,10 @@ inline arrow::Status AppendFieldValueToBuilder( } case arrow::Type::LIST: { auto list_builder = dynamic_cast(builder); - auto list_type = - std::dynamic_pointer_cast(field->type()); - - if (!list_type) { - return arrow::Status::TypeError("Field type is not ListType"); - } + // Use static_pointer_cast: the switch guarantees type == LIST; + // dynamic_pointer_cast fails on Android due to RTTI divergence + // when Arrow is linked as a static archive. + auto list_type = std::static_pointer_cast(field->type()); auto value_type = list_type->value_type()->id(); @@ -699,8 +691,9 @@ inline arrow::Status BuildArrayFromIndicesWithType( return BuildArrayFromIndices( chunked_array, indices_in_table, out_array); case arrow::Type::LIST: { - auto list_type = - std::dynamic_pointer_cast(col_data_type); + // static_pointer_cast: switch guarantees type == LIST; avoids + // Android RTTI divergence with Arrow static archive. 
+ auto list_type = std::static_pointer_cast(col_data_type); return BuildListArrayFromIndices(chunked_array, indices_in_table, list_type, out_array); } diff --git a/src/include/zvec/ailego/buffer/vector_page_table.h b/src/include/zvec/ailego/buffer/vector_page_table.h index c6a08c9da..8bcc13e99 100644 --- a/src/include/zvec/ailego/buffer/vector_page_table.h +++ b/src/include/zvec/ailego/buffer/vector_page_table.h @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -48,16 +49,28 @@ class VectorPageTable { struct Entry { std::atomic ref_count; std::atomic in_evict_queue; + std::atomic is_dirty; char *buffer; + size_t file_offset; }; public: - VectorPageTable() : entry_num_(0), entries_(nullptr) { + // Callback invoked by evict_block() to persist a dirty block before its + // memory is released. Signature: (block_id, buffer, size, file_offset). + using FlushCallback = std::function; + + VectorPageTable() { BlockEvictionQueue::get_instance().set_valid(this); } ~VectorPageTable() { BlockEvictionQueue::get_instance().set_invalid(this); - delete[] entries_; + // Destructor runs without concurrent readers/writers (callers guarantee + // no live handles by the time the page table is destroyed), so a relaxed + // load is sufficient here. + size_t cnt = segment_count_.load(std::memory_order_relaxed); + for (size_t i = 0; i < cnt; ++i) { + delete[] segments_[i]; + } } VectorPageTable(const VectorPageTable &) = delete; @@ -65,7 +78,17 @@ class VectorPageTable { VectorPageTable(VectorPageTable &&) = delete; VectorPageTable &operator=(VectorPageTable &&) = delete; - void init(size_t entry_num); + //! Initialize the page table to cover `entry_num` entries. + //! Returns false (without modifying state) if `entry_num` exceeds the + //! statically allocated segment table capacity (kMaxEntries). + bool init(size_t entry_num); + + //! Extend the page table to cover at least `new_entry_num` entries. + //! 
Existing entries stay at their original addresses (no invalidation). + //! Safe to call while readers operate on existing pages. + //! Returns false (without modifying state) if `new_entry_num` exceeds + //! the statically allocated segment table capacity (kMaxEntries). + bool extend(size_t new_entry_num); char *acquire_block(block_id_t block_id); @@ -73,25 +96,101 @@ class VectorPageTable { void evict_block(block_id_t block_id); - char *set_block_acquired(block_id_t block_id, char *buffer); + char *set_block_acquired(block_id_t block_id, char *buffer, + size_t file_offset); + + void set_flush_callback(FlushCallback cb) { + flush_callback_ = std::move(cb); + } + + //! Mark a loaded block as dirty so that it is persisted on eviction. + void mark_dirty(block_id_t block_id) { + assert(block_id < entry_num_.load(std::memory_order_acquire)); + entry_at(block_id).is_dirty.store(true, std::memory_order_relaxed); + } + + bool is_block_dirty(block_id_t block_id) const { + assert(block_id < entry_num_.load(std::memory_order_acquire)); + return entry_at(block_id).is_dirty.load(std::memory_order_relaxed); + } + + //! Flush a single dirty block without evicting it. Caller guarantees the + //! block is currently loaded (buffer != nullptr). + int flush_block(block_id_t block_id) { + assert(block_id < entry_num_.load(std::memory_order_acquire)); + Entry &e = entry_at(block_id); + char *buffer = e.buffer; + if (!buffer || !flush_callback_) { + return 0; + } + if (!e.is_dirty.load(std::memory_order_relaxed)) { + return 0; + } + int rc = flush_callback_(block_id, buffer, kVectorPageSize, e.file_offset); + if (rc == 0) { + e.is_dirty.store(false, std::memory_order_relaxed); + } + return rc; + } + //! Returns the current number of entries. Uses acquire ordering so that + //! callers iterating over [0, entry_num()) are guaranteed to see all + //! segments_[s] writes performed by a concurrent extend()/init(). 
size_t entry_num() const { - return entry_num_; + return entry_num_.load(std::memory_order_acquire); } bool is_released(block_id_t block_id) const { - assert(block_id < entry_num_); - return entries_[block_id].ref_count.load(std::memory_order_relaxed) <= 0; + assert(block_id < entry_num_.load(std::memory_order_acquire)); + return entry_at(block_id).ref_count.load(std::memory_order_relaxed) <= 0; } inline bool is_dead_block(BlockEvictionQueue::BlockType block) const { - Entry &entry = entries_[block.vector_block.first]; - return !entry.in_evict_queue.load(std::memory_order_relaxed); + const Entry &e = entry_at(block.vector_block.first); + return !e.in_evict_queue.load(std::memory_order_relaxed); } private: - size_t entry_num_{0}; - Entry *entries_{nullptr}; + // Segmented page table: entries are split across fixed-size segments so + // that extend() can grow the table without moving existing entries. + static constexpr size_t kSegmentShift = 16; // 65536 entries per segment + static constexpr size_t kSegmentSize = size_t{1} << kSegmentShift; + static constexpr size_t kSegmentMask = kSegmentSize - 1; + + public: + static constexpr size_t kMaxSegments = + 2048; // up to 128M entries (512GB @ 4K) + // Maximum number of entries the segment table can ever hold. Callers + // (e.g. VecBufferPool::extend_file) can use this to pre-validate a target + // file size before mutating any on-disk state. + static constexpr size_t kMaxEntries = kMaxSegments * kSegmentSize; + + private: + // entry_num_ and segment_count_ are mutated by writers in init()/extend() + // and observed by readers in entry_num() and the hot-path methods. They + // are atomic to establish a release/acquire synchronization edge with the + // (non-atomic) writes to segments_[s] performed prior to the store: any + // reader that observes the new entry_num_ is guaranteed to see the + // fully-initialized Entry slots in the corresponding segment. 
+ std::atomic entry_num_{0}; + std::atomic segment_count_{0}; + Entry *segments_[kMaxSegments]{}; + + // Pair with the release-store on segment_count_ in init()/extend() so + // that any reader observing the published segment table also sees the + // fully-initialized segments_[s] pointer and Entry slots. Without this + // acquire load, segments_[s] can be re-read as nullptr or a torn + // pointer on weak memory models (and even reordered on x86 under -O2). + Entry &entry_at(size_t idx) { + (void)segment_count_.load(std::memory_order_acquire); + return segments_[idx >> kSegmentShift][idx & kSegmentMask]; + } + const Entry &entry_at(size_t idx) const { + (void)segment_count_.load(std::memory_order_acquire); + return segments_[idx >> kSegmentShift][idx & kSegmentMask]; + } + + FlushCallback flush_callback_{}; }; class VecBufferPoolHandle; @@ -102,8 +201,11 @@ class VecBufferPool { static constexpr size_t kMutexBucketCount = 64UL * 1024UL; - VecBufferPool(const std::string &filename); + VecBufferPool(const std::string &filename, bool writable = false); ~VecBufferPool() { + // Flush any remaining dirty blocks before tearing down memory/fd so that + // writes are not silently lost. Safe to call even in read-only mode. + (void)this->flush_all(); for (size_t i = 0; i < page_table_.entry_num(); ++i) { assert(page_table_.is_released(i)); page_table_.evict_block(i); @@ -123,6 +225,29 @@ class VecBufferPool { int get_meta(size_t offset, size_t length, char *buffer); + //! Write a contiguous range via the page cache; marks touched pages dirty. + //! Returns 0 on success, -1 on failure (e.g. read-only pool or I/O error). + int write_range(size_t file_offset, size_t length, const char *src); + + //! Write raw bytes directly via pwrite, bypassing the page cache. Used for + //! metadata regions (header/footer/segments_meta) which are only read via + //! get_meta() and never cached. + int write_meta(size_t offset, size_t length, const char *buffer); + + //! 
Iterate all entries and persist any dirty blocks to disk. Safe to call + //! repeatedly; no-op in read-only mode. + int flush_all(); + + //! Extend the backing file to `new_size` bytes via ftruncate (no-op if + //! already >= new_size), refresh the cached file_size_, and extend the + //! page_table to cover the new range. Returns true on success, false on + //! a read-only pool or I/O failure. + bool extend_file(size_t new_size); + + bool writable() const { + return writable_; + } + size_t file_size() const { return file_size_; } @@ -131,6 +256,7 @@ class VecBufferPool { int fd_; size_t file_size_; std::string file_name_; + bool writable_{false}; public: VectorPageTable page_table_; @@ -154,6 +280,14 @@ class VecBufferPoolHandle { int get_meta(size_t offset, size_t length, char *buffer); + int write_range(size_t file_offset, size_t len, const char *src); + + int write_meta(size_t offset, size_t length, const char *buffer); + + int flush_all(); + + bool writable() const; + void release_one(block_id_t block_id); void acquire_one(block_id_t block_id); diff --git a/src/include/zvec/core/framework/index_storage.h b/src/include/zvec/core/framework/index_storage.h index 530073aad..3da2e6669 100644 --- a/src/include/zvec/core/framework/index_storage.h +++ b/src/include/zvec/core/framework/index_storage.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -47,23 +48,35 @@ class IndexStorage : public IndexModule { } MemoryBlock(void *data) : type_(MemoryBlockType::MBT_MMAP), data_(data) {} - static MemoryBlock MakeOwned(void *owned) { + //! Build an HEAP_SCRATCH MemoryBlock that owns `owned` (allocated via + //! ailego_malloc / ailego_aligned_malloc). `size` is the byte length of + //! the buffer and is required so that copy construction / copy + //! assignment can deep-copy the buffer instead of aliasing it (a shallow + //! copy would result in use-after-free once the original block is + //! destructed and frees the buffer). 
+ static MemoryBlock MakeOwned(void *owned, size_t size) { MemoryBlock mb; mb.type_ = MemoryBlockType::MBT_HEAP_SCRATCH; mb.data_ = owned; + mb.scratch_size_ = size; return mb; } MemoryBlock(const MemoryBlock &rhs) { switch (rhs.type_) { case MemoryBlockType::MBT_MMAP: - case MemoryBlockType::MBT_HEAP_SCRATCH: this->reset(rhs.data_); break; case MemoryBlockType::MBT_BUFFERPOOL: this->reset(rhs.buffer_pool_handle_, rhs.buffer_block_id_, rhs.data_); buffer_pool_handle_->acquire_one(buffer_block_id_); break; + case MemoryBlockType::MBT_HEAP_SCRATCH: + // Deep copy: each owner must hold its own buffer, otherwise the + // first destructor frees the buffer and leaves the surviving + // copies dangling. + deep_copy_from(rhs); + break; default: break; } @@ -83,7 +96,9 @@ class IndexStorage : public IndexModule { case MemoryBlockType::MBT_HEAP_SCRATCH: type_ = MemoryBlockType::MBT_HEAP_SCRATCH; data_ = rhs.data_; + scratch_size_ = rhs.scratch_size_; rhs.data_ = nullptr; + rhs.scratch_size_ = 0; rhs.type_ = MemoryBlockType::MBT_UNKNOWN; break; default: @@ -103,7 +118,8 @@ class IndexStorage : public IndexModule { buffer_pool_handle_->acquire_one(buffer_block_id_); break; case MemoryBlockType::MBT_HEAP_SCRATCH: - this->reset(rhs.data_); + release_current(); + deep_copy_from(rhs); break; default: break; @@ -125,10 +141,12 @@ class IndexStorage : public IndexModule { rhs.type_ = MemoryBlockType::MBT_UNKNOWN; break; case MemoryBlockType::MBT_HEAP_SCRATCH: - release_owned(); + release_current(); type_ = MemoryBlockType::MBT_HEAP_SCRATCH; data_ = rhs.data_; + scratch_size_ = rhs.scratch_size_; rhs.data_ = nullptr; + rhs.scratch_size_ = 0; rhs.type_ = MemoryBlockType::MBT_UNKNOWN; break; default: @@ -154,6 +172,7 @@ class IndexStorage : public IndexModule { break; } data_ = nullptr; + scratch_size_ = 0; } const void *data() const { @@ -188,6 +207,10 @@ class IndexStorage : public IndexModule { void *data_{nullptr}; mutable ailego::VecBufferPoolHandle *buffer_pool_handle_{nullptr}; 
size_t buffer_block_id_{0}; + //! Byte size of the heap-scratch buffer pointed to by `data_`; only used + //! when type_ == MBT_HEAP_SCRATCH. Required for safe deep-copy on + //! copy-construction / copy-assignment of HEAP_SCRATCH blocks. + size_t scratch_size_{0}; private: void release_owned() { @@ -195,6 +218,44 @@ class IndexStorage : public IndexModule { ailego_free(data_); data_ = nullptr; } + scratch_size_ = 0; + } + + //! Drop whatever the current MemoryBlock holds, regardless of type, so + //! that the slot is ready to receive new ownership. Mirrors what the + //! destructor would do (minus zeroing data_) but leaves the type alone + //! for the caller to overwrite immediately afterwards. + void release_current() { + switch (type_) { + case MemoryBlockType::MBT_BUFFERPOOL: + if (buffer_pool_handle_) { + buffer_pool_handle_->release_one(buffer_block_id_); + buffer_pool_handle_ = nullptr; + } + break; + case MemoryBlockType::MBT_HEAP_SCRATCH: + release_owned(); + break; + default: + break; + } + data_ = nullptr; + type_ = MemoryBlockType::MBT_UNKNOWN; + } + + //! Allocate a fresh buffer of the same size as `rhs.scratch_size_`, + //! memcpy `rhs.data_` into it, and become the new owner. Used by the + //! HEAP_SCRATCH copy ctor / copy assignment so the original and the + //! copy each free their own buffer independently. 
+ void deep_copy_from(const MemoryBlock &rhs) { + type_ = MemoryBlockType::MBT_HEAP_SCRATCH; + scratch_size_ = rhs.scratch_size_; + if (scratch_size_ > 0 && rhs.data_) { + data_ = ailego_malloc(scratch_size_); + std::memcpy(data_, rhs.data_, scratch_size_); + } else { + data_ = nullptr; + } } }; diff --git a/tests/core/algorithm/flat/flat_streamer_buffer_test.cc b/tests/core/algorithm/flat/flat_streamer_buffer_test.cc index 441853c86..e3fce1f24 100644 --- a/tests/core/algorithm/flat/flat_streamer_buffer_test.cc +++ b/tests/core/algorithm/flat/flat_streamer_buffer_test.cc @@ -168,6 +168,251 @@ TEST_F(FlatStreamerTest, TestLinearSearch) { read_streamer.reset(); } +TEST_F(FlatStreamerTest, TestLinearSearchBuffer) { + MemoryLimitPool::get_instance().init(2 * 1024UL * 1024UL * 1024UL); + IndexStreamer::Pointer write_streamer = + IndexFactory::CreateStreamer("FlatStreamer"); + ASSERT_TRUE(write_streamer != nullptr); + + Params params; + ASSERT_EQ(0, write_streamer->init(*index_meta_ptr_, params)); + auto storage = IndexFactory::CreateStorage("BufferStorage"); + ASSERT_NE(nullptr, storage); + Params stg_params; + ASSERT_EQ(0, storage->init(stg_params)); + ASSERT_EQ(0, storage->open(dir_ + "Test/LinearSearchBuffer", true)); + ASSERT_EQ(0, write_streamer->open(storage)); + + auto ctx = write_streamer->create_context(); + ASSERT_TRUE(!!ctx); + + size_t cnt = 10000UL; + IndexQueryMeta qmeta(IndexMeta::DT_FP32, dim); + for (size_t i = 0; i < cnt; i++) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + write_streamer->add_impl(i, vec.data(), qmeta, ctx); + } + write_streamer->flush(0UL); + write_streamer->close(); + write_streamer.reset(); + storage->close(); + + IndexStreamer::Pointer read_streamer = + IndexFactory::CreateStreamer("FlatStreamer"); + ASSERT_EQ(0, read_streamer->init(*index_meta_ptr_, params)); + auto read_storage = IndexFactory::CreateStorage("BufferStorage"); + ASSERT_NE(nullptr, read_storage); + ASSERT_EQ(0, 
read_storage->init(stg_params)); + ASSERT_EQ(0, read_storage->open(dir_ + "Test/LinearSearchBuffer", false)); + ASSERT_EQ(0, read_streamer->open(read_storage)); + size_t topk = 3; + auto provider = read_streamer->create_provider(); + for (size_t i = 0; i < cnt; i += 1) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result1 = ctx->result(); + ASSERT_EQ(topk, result1.size()); + IndexStorage::MemoryBlock block; + ASSERT_EQ(0, provider->get_vector(result1[0].key(), block)); + const float *data = (float *)block.data(); + for (size_t j = 0; j < dim; ++j) { + ASSERT_FLOAT_EQ(data[j], i); + } + ASSERT_EQ(i, result1[0].key()); + + for (size_t j = 0; j < dim; ++j) { + vec[j] = i + 0.1f; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result2 = ctx->result(); + ASSERT_EQ(topk, result2.size()); + ASSERT_EQ(i, result2[0].key()); + ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); + ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? 
i - 2 : i - 1), result2[2].key()); + } + + ctx->set_topk(100U); + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = 10.1f; + } + ASSERT_EQ(0, read_streamer->search_bf_impl(vec.data(), qmeta, ctx)); + auto &result = ctx->result(); + ASSERT_EQ(100U, result.size()); + ASSERT_EQ(10, result[0].key()); + ASSERT_EQ(11, result[1].key()); + ASSERT_EQ(5, result[10].key()); + ASSERT_EQ(0, result[20].key()); + ASSERT_EQ(30, result[30].key()); + ASSERT_EQ(35, result[35].key()); + ASSERT_EQ(99, result[99].key()); + + ElapsedTime elapsed_time; + for (size_t i = 0; i < cnt; i += 1) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result1 = ctx->result(); + ASSERT_EQ(topk, result1.size()); + IndexStorage::MemoryBlock block; + ASSERT_EQ(0, provider->get_vector(result1[0].key(), block)); + const float *data = (float *)block.data(); + for (size_t j = 0; j < dim; ++j) { + ASSERT_FLOAT_EQ(data[j], i); + } + ASSERT_EQ(i, result1[0].key()); + + for (size_t j = 0; j < dim; ++j) { + vec[j] = i + 0.1f; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result2 = ctx->result(); + ASSERT_EQ(topk, result2.size()); + ASSERT_EQ(i, result2[0].key()); + ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); + ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? 
i - 2 : i - 1), result2[2].key()); + } + cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl; + + read_streamer->close(); + read_streamer.reset(); +} + +TEST_F(FlatStreamerTest, TestLinearSearchBufferMMap) { + MemoryLimitPool::get_instance().init(2 * 1024UL * 1024UL * 1024UL); + IndexStreamer::Pointer write_streamer = + IndexFactory::CreateStreamer("FlatStreamer"); + ASSERT_TRUE(write_streamer != nullptr); + + Params params; + ASSERT_EQ(0, write_streamer->init(*index_meta_ptr_, params)); + auto storage = IndexFactory::CreateStorage("BufferStorage"); + ASSERT_NE(nullptr, storage); + Params stg_params; + ASSERT_EQ(0, storage->init(stg_params)); + ASSERT_EQ(0, storage->open(dir_ + "Test/LinearSearchBuffer", true)); + ASSERT_EQ(0, write_streamer->open(storage)); + + auto ctx = write_streamer->create_context(); + ASSERT_TRUE(!!ctx); + + size_t cnt = 10000UL; + IndexQueryMeta qmeta(IndexMeta::DT_FP32, dim); + for (size_t i = 0; i < cnt; i++) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + write_streamer->add_impl(i, vec.data(), qmeta, ctx); + } + write_streamer->flush(0UL); + write_streamer->close(); + write_streamer.reset(); + storage->close(); + + IndexStreamer::Pointer read_streamer = + IndexFactory::CreateStreamer("FlatStreamer"); + ASSERT_EQ(0, read_streamer->init(*index_meta_ptr_, params)); + auto read_storage = IndexFactory::CreateStorage("MMapFileStorage"); + ASSERT_NE(nullptr, read_storage); + ASSERT_EQ(0, read_storage->init(stg_params)); + ASSERT_EQ(0, read_storage->open(dir_ + "Test/LinearSearchBuffer", false)); + ASSERT_EQ(0, read_streamer->open(read_storage)); + size_t topk = 3; + auto provider = read_streamer->create_provider(); + for (size_t i = 0; i < cnt; i += 1) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result1 = ctx->result(); + ASSERT_EQ(topk, 
result1.size()); + IndexStorage::MemoryBlock block; + ASSERT_EQ(0, provider->get_vector(result1[0].key(), block)); + const float *data = (float *)block.data(); + for (size_t j = 0; j < dim; ++j) { + ASSERT_FLOAT_EQ(data[j], i); + } + ASSERT_EQ(i, result1[0].key()); + + for (size_t j = 0; j < dim; ++j) { + vec[j] = i + 0.1f; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result2 = ctx->result(); + ASSERT_EQ(topk, result2.size()); + ASSERT_EQ(i, result2[0].key()); + ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); + ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key()); + } + + ctx->set_topk(100U); + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = 10.1f; + } + ASSERT_EQ(0, read_streamer->search_bf_impl(vec.data(), qmeta, ctx)); + auto &result = ctx->result(); + ASSERT_EQ(100U, result.size()); + ASSERT_EQ(10, result[0].key()); + ASSERT_EQ(11, result[1].key()); + ASSERT_EQ(5, result[10].key()); + ASSERT_EQ(0, result[20].key()); + ASSERT_EQ(30, result[30].key()); + ASSERT_EQ(35, result[35].key()); + ASSERT_EQ(99, result[99].key()); + + ElapsedTime elapsed_time; + for (size_t i = 0; i < cnt; i += 1) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result1 = ctx->result(); + ASSERT_EQ(topk, result1.size()); + IndexStorage::MemoryBlock block; + ASSERT_EQ(0, provider->get_vector(result1[0].key(), block)); + const float *data = (float *)block.data(); + for (size_t j = 0; j < dim; ++j) { + ASSERT_FLOAT_EQ(data[j], i); + } + ASSERT_EQ(i, result1[0].key()); + + for (size_t j = 0; j < dim; ++j) { + vec[j] = i + 0.1f; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result2 = ctx->result(); + ASSERT_EQ(topk, result2.size()); + ASSERT_EQ(i, result2[0].key()); + ASSERT_EQ(i == cnt - 1 ? 
i - 1 : i + 1, result2[1].key()); + ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key()); + } + cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl; + + read_streamer->close(); + read_streamer.reset(); +} + + TEST_F(FlatStreamerTest, TestLinearSearchWithLRU) { MemoryLimitPool::get_instance().init(100 * 1024UL * 1024UL); #ifdef __ANDROID__ @@ -351,7 +596,6 @@ TEST_F(FlatStreamerTest, TestLinearSearchMMap) { ASSERT_EQ(topk, result1.size()); IndexStorage::MemoryBlock block; ASSERT_EQ(0, provider->get_vector(result1[0].key(), block)); - const float *data = (float *)block.data(); for (size_t j = 0; j < dim; ++j) { const float *data = (float *)provider->get_vector(result1[0].key()); EXPECT_FLOAT_EQ(data[j], i); diff --git a/tests/core/algorithm/hnsw/hnsw_streamer_buffer_test.cc b/tests/core/algorithm/hnsw/hnsw_streamer_buffer_test.cc index cf3093e22..cd21ff912 100644 --- a/tests/core/algorithm/hnsw/hnsw_streamer_buffer_test.cc +++ b/tests/core/algorithm/hnsw/hnsw_streamer_buffer_test.cc @@ -171,6 +171,254 @@ TEST_F(HnswStreamerTest, TestHnswSearch) { cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl; } +TEST_F(HnswStreamerTest, TestHnswSearchBuffer) { + MemoryLimitPool::get_instance().init(2 * 1024UL * 1024UL * 1024UL); + IndexStreamer::Pointer write_streamer = + IndexFactory::CreateStreamer("HnswStreamer"); + ASSERT_TRUE(write_streamer != nullptr); + + Params params; + params.set(PARAM_HNSW_STREAMER_GET_VECTOR_ENABLE, true); + + ASSERT_EQ(0, write_streamer->init(*index_meta_ptr_, params)); + auto storage = IndexFactory::CreateStorage("BufferStorage"); + ASSERT_NE(nullptr, storage); + Params stg_params; + ASSERT_EQ(0, storage->init(stg_params)); + ASSERT_EQ(0, storage->open(dir_ + "Test/TestHnswSearchBuffer", true)); + ASSERT_EQ(0, write_streamer->open(storage)); + + auto ctx = write_streamer->create_context(); + ASSERT_TRUE(!!ctx); + + size_t cnt = 10000UL; + IndexQueryMeta qmeta(IndexMeta::DT_FP32, dim); 
+ for (size_t i = 0; i < cnt; i++) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + write_streamer->add_impl(i, vec.data(), qmeta, ctx); + } + write_streamer->flush(0UL); + write_streamer->close(); + write_streamer.reset(); + storage->close(); + + IndexStreamer::Pointer read_streamer = + IndexFactory::CreateStreamer("HnswStreamer"); + ASSERT_EQ(0, read_streamer->init(*index_meta_ptr_, params)); + auto read_storage = IndexFactory::CreateStorage("BufferStorage"); + ASSERT_NE(nullptr, read_storage); + ASSERT_EQ(0, read_storage->init(stg_params)); + ASSERT_EQ(0, read_storage->open(dir_ + "Test/TestHnswSearchBuffer", false)); + ASSERT_EQ(0, read_streamer->open(read_storage)); + size_t topk = 3; + auto provider = read_streamer->create_provider(); + for (size_t i = 0; i < cnt; i += 1) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result1 = ctx->result(); + ASSERT_EQ(topk, result1.size()); + IndexStorage::MemoryBlock block; + ASSERT_EQ(0, provider->get_vector(result1[0].key(), block)); + const float *data = (float *)block.data(); + for (size_t j = 0; j < dim; ++j) { + ASSERT_EQ(data[j], i); + } + ASSERT_EQ(i, result1[0].key()); + + for (size_t j = 0; j < dim; ++j) { + vec[j] = i + 0.1f; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result2 = ctx->result(); + ASSERT_EQ(topk, result2.size()); + ASSERT_EQ(i, result2[0].key()); + ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); + ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? 
i - 2 : i - 1), result2[2].key()); + } + + ctx->set_topk(100U); + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = 10.1f; + } + ASSERT_EQ(0, read_streamer->search_bf_impl(vec.data(), qmeta, ctx)); + auto &result = ctx->result(); + ASSERT_EQ(100U, result.size()); + ASSERT_EQ(10, result[0].key()); + ASSERT_EQ(11, result[1].key()); + ASSERT_EQ(5, result[10].key()); + ASSERT_EQ(0, result[20].key()); + ASSERT_EQ(30, result[30].key()); + ASSERT_EQ(35, result[35].key()); + ASSERT_EQ(99, result[99].key()); + + ElapsedTime elapsed_time; + for (size_t i = 0; i < cnt; i += 1) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result1 = ctx->result(); + ASSERT_EQ(topk, result1.size()); + IndexStorage::MemoryBlock block; + ASSERT_EQ(0, provider->get_vector(result1[0].key(), block)); + const float *data = (float *)block.data(); + for (size_t j = 0; j < dim; ++j) { + ASSERT_EQ(data[j], i); + } + ASSERT_EQ(i, result1[0].key()); + + for (size_t j = 0; j < dim; ++j) { + vec[j] = i + 0.1f; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result2 = ctx->result(); + ASSERT_EQ(topk, result2.size()); + ASSERT_EQ(i, result2[0].key()); + ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); + ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? 
i - 2 : i - 1), result2[2].key()); + } + + read_streamer->close(); + read_streamer.reset(); + cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl; +} + +TEST_F(HnswStreamerTest, TestHnswSearchBufferMMap) { + MemoryLimitPool::get_instance().init(2 * 1024UL * 1024UL * 1024UL); + IndexStreamer::Pointer write_streamer = + IndexFactory::CreateStreamer("HnswStreamer"); + ASSERT_TRUE(write_streamer != nullptr); + + Params params; + params.set(PARAM_HNSW_STREAMER_GET_VECTOR_ENABLE, true); + + ASSERT_EQ(0, write_streamer->init(*index_meta_ptr_, params)); + auto storage = IndexFactory::CreateStorage("BufferStorage"); + ASSERT_NE(nullptr, storage); + Params stg_params; + ASSERT_EQ(0, storage->init(stg_params)); + ASSERT_EQ(0, storage->open(dir_ + "Test/TestHnswSearchBufferMMap", true)); + ASSERT_EQ(0, write_streamer->open(storage)); + + auto ctx = write_streamer->create_context(); + ASSERT_TRUE(!!ctx); + + size_t cnt = 10000UL; + IndexQueryMeta qmeta(IndexMeta::DT_FP32, dim); + for (size_t i = 0; i < cnt; i++) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + write_streamer->add_impl(i, vec.data(), qmeta, ctx); + } + write_streamer->flush(0UL); + write_streamer->close(); + write_streamer.reset(); + storage->close(); + + IndexStreamer::Pointer read_streamer = + IndexFactory::CreateStreamer("HnswStreamer"); + ASSERT_EQ(0, read_streamer->init(*index_meta_ptr_, params)); + auto read_storage = IndexFactory::CreateStorage("MMapFileStorage"); + ASSERT_NE(nullptr, read_storage); + ASSERT_EQ(0, read_storage->init(stg_params)); + ASSERT_EQ(0, read_storage->open(dir_ + "Test/TestHnswSearchBufferMMap", false)); + ASSERT_EQ(0, read_streamer->open(read_storage)); + size_t topk = 3; + auto provider = read_streamer->create_provider(); + for (size_t i = 0; i < cnt; i += 1) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), 
qmeta, ctx)); + auto &result1 = ctx->result(); + ASSERT_EQ(topk, result1.size()); + IndexStorage::MemoryBlock block; + ASSERT_EQ(0, provider->get_vector(result1[0].key(), block)); + const float *data = (float *)block.data(); + for (size_t j = 0; j < dim; ++j) { + ASSERT_EQ(data[j], i); + } + ASSERT_EQ(i, result1[0].key()); + + for (size_t j = 0; j < dim; ++j) { + vec[j] = i + 0.1f; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result2 = ctx->result(); + ASSERT_EQ(topk, result2.size()); + ASSERT_EQ(i, result2[0].key()); + ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); + ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key()); + } + + ctx->set_topk(100U); + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = 10.1f; + } + ASSERT_EQ(0, read_streamer->search_bf_impl(vec.data(), qmeta, ctx)); + auto &result = ctx->result(); + ASSERT_EQ(100U, result.size()); + ASSERT_EQ(10, result[0].key()); + ASSERT_EQ(11, result[1].key()); + ASSERT_EQ(5, result[10].key()); + ASSERT_EQ(0, result[20].key()); + ASSERT_EQ(30, result[30].key()); + ASSERT_EQ(35, result[35].key()); + ASSERT_EQ(99, result[99].key()); + + ElapsedTime elapsed_time; + for (size_t i = 0; i < cnt; i += 1) { + NumericalVector vec(dim); + for (size_t j = 0; j < dim; ++j) { + vec[j] = i; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result1 = ctx->result(); + ASSERT_EQ(topk, result1.size()); + IndexStorage::MemoryBlock block; + ASSERT_EQ(0, provider->get_vector(result1[0].key(), block)); + const float *data = (float *)block.data(); + for (size_t j = 0; j < dim; ++j) { + ASSERT_EQ(data[j], i); + } + ASSERT_EQ(i, result1[0].key()); + + for (size_t j = 0; j < dim; ++j) { + vec[j] = i + 0.1f; + } + ctx->set_topk(topk); + ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx)); + auto &result2 = ctx->result(); + ASSERT_EQ(topk, result2.size()); + 
ASSERT_EQ(i, result2[0].key()); + ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key()); + ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key()); + } + + read_streamer->close(); + read_streamer.reset(); + cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl; +} + TEST_F(HnswStreamerTest, TestHnswSearchMMap) { IndexStreamer::Pointer write_streamer = IndexFactory::CreateStreamer("HnswStreamer"); diff --git a/tests/core/utility/buffer_storage_write_test.cc b/tests/core/utility/buffer_storage_write_test.cc new file mode 100644 index 000000000..a97a32c17 --- /dev/null +++ b/tests/core/utility/buffer_storage_write_test.cc @@ -0,0 +1,1181 @@ +// Copyright 2025-present the zvec project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace zvec; +using namespace zvec::core; + +class BufferStorageWriteTest : public ::testing::Test { + protected: + static void SetUpTestSuite() { + // Initialize the memory limit pool with 64MB - enough for all tests. 
+ ailego::MemoryLimitPool::get_instance().init(64 * 1024UL * 1024UL); + } + + void SetUp() override { + file_path_ = "buffer_storage_write_test_dir/test_" + + std::to_string(reinterpret_cast(this)); + ailego::File::Delete(file_path_); + ailego::File::MakePath("buffer_storage_write_test_dir"); + } + + void TearDown() override { ailego::File::Delete(file_path_); } + + // Open BufferStorage in writable mode (create_if_missing=true) + IndexStorage::Pointer OpenWritable() { + auto storage = IndexFactory::CreateStorage("BufferStorage"); + if (!storage) return nullptr; + ailego::Params params; + storage->init(params); + if (storage->open(file_path_, true) != 0) return nullptr; + return storage; + } + + // Open BufferStorage in read-only mode + IndexStorage::Pointer OpenReadOnly() { + auto storage = IndexFactory::CreateStorage("BufferStorage"); + if (!storage) return nullptr; + ailego::Params params; + storage->init(params); + if (storage->open(file_path_, false) != 0) return nullptr; + return storage; + } + + std::string file_path_; +}; + +// ===== Basic Write Tests ===== + +// Test: Create new index via BufferStorage, append segment, write data, read back +TEST_F(BufferStorageWriteTest, WriteBasicCreateAndWrite) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + ASSERT_EQ(0, storage->append("seg1", 4096)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + + std::string data = "Hello BufferStorage Write!"; + EXPECT_EQ(data.size(), seg->write(0, data.data(), data.size())); + + // Verify data via fetch + std::vector buf(data.size()); + EXPECT_EQ(data.size(), seg->fetch(0, buf.data(), buf.size())); + EXPECT_EQ(data, std::string(buf.data(), buf.size())); + + // data_size should reflect the written bytes + EXPECT_EQ(data.size(), seg->data_size()); + EXPECT_EQ(0, storage->close()); +} + +// Test: Write at non-zero offset within the segment +TEST_F(BufferStorageWriteTest, WriteAtNonZeroOffset) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + 
ASSERT_EQ(0, storage->append("seg1", 8192)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + + // First write at offset 0 + std::string first = "AAAA"; + EXPECT_EQ(first.size(), seg->write(0, first.data(), first.size())); + + // Second write at offset 100 + std::string second = "BBBB"; + EXPECT_EQ(second.size(), seg->write(100, second.data(), second.size())); + + // data_size should be max(first.end, second.end) = 104 + EXPECT_EQ(104u, seg->data_size()); + + // Verify both writes + std::vector buf1(first.size()); + EXPECT_EQ(first.size(), seg->fetch(0, buf1.data(), buf1.size())); + EXPECT_EQ(first, std::string(buf1.data(), buf1.size())); + + std::vector buf2(second.size()); + EXPECT_EQ(second.size(), seg->fetch(100, buf2.data(), buf2.size())); + EXPECT_EQ(second, std::string(buf2.data(), buf2.size())); + + EXPECT_EQ(0, storage->close()); +} + +// Test: Write to multiple independent segments +TEST_F(BufferStorageWriteTest, WriteMultipleSegments) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + ASSERT_EQ(0, storage->append("seg_a", 4096)); + ASSERT_EQ(0, storage->append("seg_b", 4096)); + ASSERT_EQ(0, storage->append("seg_c", 4096)); + + auto seg_a = storage->get("seg_a"); + auto seg_b = storage->get("seg_b"); + auto seg_c = storage->get("seg_c"); + ASSERT_TRUE(seg_a); + ASSERT_TRUE(seg_b); + ASSERT_TRUE(seg_c); + + std::string da = "data_for_a"; + std::string db = "data_for_b_longer"; + std::string dc = "c"; + + EXPECT_EQ(da.size(), seg_a->write(0, da.data(), da.size())); + EXPECT_EQ(db.size(), seg_b->write(0, db.data(), db.size())); + EXPECT_EQ(dc.size(), seg_c->write(0, dc.data(), dc.size())); + + // Verify independently + std::vector buf(db.size()); + EXPECT_EQ(da.size(), seg_a->fetch(0, buf.data(), da.size())); + EXPECT_EQ(da, std::string(buf.data(), da.size())); + + EXPECT_EQ(db.size(), seg_b->fetch(0, buf.data(), db.size())); + EXPECT_EQ(db, std::string(buf.data(), db.size())); + + EXPECT_EQ(dc.size(), seg_c->fetch(0, buf.data(), 
dc.size())); + EXPECT_EQ(dc, std::string(buf.data(), dc.size())); + + EXPECT_EQ(0, storage->close()); +} + +// Test: Overwrite existing data at the same offset +TEST_F(BufferStorageWriteTest, WriteOverwrite) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + ASSERT_EQ(0, storage->append("seg1", 4096)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + + std::string first = "XXXXXXXX"; + EXPECT_EQ(first.size(), seg->write(0, first.data(), first.size())); + + std::string second = "YYYYYYYY"; + EXPECT_EQ(second.size(), seg->write(0, second.data(), second.size())); + + // Second write should overwrite + std::vector buf(second.size()); + EXPECT_EQ(second.size(), seg->fetch(0, buf.data(), buf.size())); + EXPECT_EQ(second, std::string(buf.data(), buf.size())); + + EXPECT_EQ(0, storage->close()); +} + +// ===== Boundary / Error Tests ===== + +// Test: Write exceeding segment capacity returns 0 +TEST_F(BufferStorageWriteTest, WriteExceedsCapacity) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + // Append a small segment (page-aligned, so at least 4096 bytes capacity) + ASSERT_EQ(0, storage->append("seg1", 4096)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + + size_t cap = seg->capacity(); + ASSERT_GT(cap, 0u); + + // Write at an offset that causes overflow: offset + len > capacity + std::vector big_data(cap + 1, 'Z'); + EXPECT_EQ(0u, seg->write(0, big_data.data(), big_data.size())); + + // Write at offset that exceeds capacity + std::string small = "small"; + EXPECT_EQ(0u, seg->write(cap + 1, small.data(), small.size())); + + EXPECT_EQ(0, storage->close()); +} + +// Test: Write with zero length (edge case) +TEST_F(BufferStorageWriteTest, WriteZeroLength) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + ASSERT_EQ(0, storage->append("seg1", 4096)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + + // Writing zero bytes should succeed (no-op but valid) + EXPECT_EQ(0u, seg->write(0, "x", 0)); + EXPECT_EQ(0u, 
seg->data_size()); + + EXPECT_EQ(0, storage->close()); +} + +// ===== Persistence Tests ===== + +// Test: Write, flush, close, reopen, verify data persisted +TEST_F(BufferStorageWriteTest, WriteFlushReopenVerify) { + std::string data = "Persistent data that survives close/reopen"; + + { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + ASSERT_EQ(0, storage->append("persist_seg", 8192)); + auto seg = storage->get("persist_seg"); + ASSERT_TRUE(seg); + EXPECT_EQ(data.size(), seg->write(0, data.data(), data.size())); + EXPECT_EQ(0, storage->flush()); + EXPECT_EQ(0, storage->close()); + } + + // Reopen in read-only mode and verify + { + auto storage = OpenReadOnly(); + ASSERT_TRUE(storage); + auto seg = storage->get("persist_seg"); + ASSERT_TRUE(seg); + EXPECT_EQ(data.size(), seg->data_size()); + + std::vector buf(data.size()); + EXPECT_EQ(data.size(), seg->fetch(0, buf.data(), buf.size())); + EXPECT_EQ(data, std::string(buf.data(), buf.size())); + EXPECT_EQ(0, storage->close()); + } +} + +// Test: Multiple write-flush cycles persist all data +TEST_F(BufferStorageWriteTest, WriteMultipleFlushCycles) { + std::string data1 = "first_write"; + std::string data2 = "second_write_longer"; + + { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + ASSERT_EQ(0, storage->append("seg1", 4096)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + + // First write + flush + EXPECT_EQ(data1.size(), seg->write(0, data1.data(), data1.size())); + EXPECT_EQ(0, storage->flush()); + + // Second write at a different offset + flush + EXPECT_EQ(data2.size(), + seg->write(200, data2.data(), data2.size())); + EXPECT_EQ(0, storage->flush()); + EXPECT_EQ(0, storage->close()); + } + + // Verify persistence + { + auto storage = OpenReadOnly(); + ASSERT_TRUE(storage); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + + std::vector buf1(data1.size()); + EXPECT_EQ(data1.size(), seg->fetch(0, buf1.data(), buf1.size())); + EXPECT_EQ(data1, std::string(buf1.data(), 
buf1.size())); + + std::vector buf2(data2.size()); + EXPECT_EQ(data2.size(), seg->fetch(200, buf2.data(), buf2.size())); + EXPECT_EQ(data2, std::string(buf2.data(), buf2.size())); + + EXPECT_EQ(0, storage->close()); + } +} + +// Test: Close without explicit flush still persists (close_index does flush) +TEST_F(BufferStorageWriteTest, WriteCloseWithoutExplicitFlush) { + std::string data = "should_persist_on_close"; + + { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + ASSERT_EQ(0, storage->append("seg1", 4096)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + EXPECT_EQ(data.size(), seg->write(0, data.data(), data.size())); + // No explicit flush - close should handle it + EXPECT_EQ(0, storage->close()); + } + + { + auto storage = OpenReadOnly(); + ASSERT_TRUE(storage); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + std::vector buf(data.size()); + EXPECT_EQ(data.size(), seg->fetch(0, buf.data(), buf.size())); + EXPECT_EQ(data, std::string(buf.data(), buf.size())); + EXPECT_EQ(0, storage->close()); + } +} + +// ===== Read-Only Behavior ===== + +// Test: Write to read-only storage is a silent no-op (returns len) +TEST_F(BufferStorageWriteTest, WriteReadOnlyNoOp) { + // First create an index file with a segment + { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + ASSERT_EQ(0, storage->append("seg1", 4096)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + std::string init_data = "initial"; + seg->write(0, init_data.data(), init_data.size()); + EXPECT_EQ(0, storage->flush()); + EXPECT_EQ(0, storage->close()); + } + + // Open read-only and attempt write + { + auto storage = OpenReadOnly(); + ASSERT_TRUE(storage); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + + std::string new_data = "overwrite_attempt"; + // Should return len (silent no-op) + EXPECT_EQ(new_data.size(), + seg->write(0, new_data.data(), new_data.size())); + + // Data should remain unchanged (still "initial") + std::vector buf(7); + EXPECT_EQ(7u, 
seg->fetch(0, buf.data(), 7)); + EXPECT_EQ("initial", std::string(buf.data(), 7)); + + EXPECT_EQ(0, storage->close()); + } +} + +// ===== Resize Tests ===== + +// Test: Resize increases data_size without writing +TEST_F(BufferStorageWriteTest, ResizeGrow) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + ASSERT_EQ(0, storage->append("seg1", 4096)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + + EXPECT_EQ(0u, seg->data_size()); + size_t new_size = seg->resize(512); + EXPECT_EQ(512u, new_size); + EXPECT_EQ(512u, seg->data_size()); + EXPECT_EQ(seg->capacity() - 512, seg->padding_size()); + + EXPECT_EQ(0, storage->close()); +} + +// Test: Resize shrinks data_size +TEST_F(BufferStorageWriteTest, ResizeShrink) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + ASSERT_EQ(0, storage->append("seg1", 4096)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + + // Write to grow data_size to 100 + std::vector buf(100, 'X'); + seg->write(0, buf.data(), buf.size()); + EXPECT_EQ(100u, seg->data_size()); + + // Resize to smaller + size_t new_size = seg->resize(50); + EXPECT_EQ(50u, new_size); + EXPECT_EQ(50u, seg->data_size()); + + EXPECT_EQ(0, storage->close()); +} + +// Test: Resize beyond capacity is clamped +TEST_F(BufferStorageWriteTest, ResizeBeyondCapacityClamped) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + ASSERT_EQ(0, storage->append("seg1", 4096)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + + size_t cap = seg->capacity(); + size_t result = seg->resize(cap + 1000); + EXPECT_EQ(cap, result); + EXPECT_EQ(cap, seg->data_size()); + EXPECT_EQ(0u, seg->padding_size()); + + EXPECT_EQ(0, storage->close()); +} + +// ===== CRC Tests ===== + +// Test: update_data_crc reflects in data_crc() getter +TEST_F(BufferStorageWriteTest, UpdateDataCrc) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + ASSERT_EQ(0, storage->append("seg1", 4096)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + + 
uint32_t new_crc = 0xDEADBEEF; + seg->update_data_crc(new_crc); + EXPECT_EQ(new_crc, seg->data_crc()); + + EXPECT_EQ(0, storage->close()); +} + +// Test: CRC persists after flush and reopen +TEST_F(BufferStorageWriteTest, UpdateDataCrcPersistence) { + uint32_t crc_val = 0x12345678; + { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + ASSERT_EQ(0, storage->append("seg1", 4096)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + std::string data = "crc_test_data"; + seg->write(0, data.data(), data.size()); + seg->update_data_crc(crc_val); + EXPECT_EQ(0, storage->flush()); + EXPECT_EQ(0, storage->close()); + } + + { + auto storage = OpenReadOnly(); + ASSERT_TRUE(storage); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + EXPECT_EQ(crc_val, seg->data_crc()); + EXPECT_EQ(0, storage->close()); + } +} + +// ===== Concurrency Tests ===== + +// Test: Multiple threads writing to different segments concurrently +TEST_F(BufferStorageWriteTest, ConcurrentWriteDifferentSegments) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + const int kNumSegments = 8; + for (int i = 0; i < kNumSegments; ++i) { + ASSERT_EQ(0, storage->append("seg_" + std::to_string(i), 16384)); + } + + std::vector threads; + std::atomic errors{0}; + + for (int i = 0; i < kNumSegments; ++i) { + threads.emplace_back([&, i]() { + auto seg = storage->get("seg_" + std::to_string(i)); + if (!seg) { + errors.fetch_add(1); + return; + } + // Each thread writes its own pattern to its own segment + std::vector data(1024, static_cast('A' + i)); + for (int j = 0; j < 10; ++j) { + size_t offset = j * 1024; + if (seg->write(offset, data.data(), data.size()) != data.size()) { + errors.fetch_add(1); + } + } + }); + } + + for (auto &t : threads) t.join(); + EXPECT_EQ(0, errors.load()); + + // Verify each segment's data + for (int i = 0; i < kNumSegments; ++i) { + auto seg = storage->get("seg_" + std::to_string(i)); + ASSERT_TRUE(seg); + // Last write was at offset 9*1024, so data_size >= 
10*1024 + EXPECT_GE(seg->data_size(), 10u * 1024u); + + std::vector buf(1024); + seg->fetch(0, buf.data(), 1024); + EXPECT_EQ(buf[0], static_cast('A' + i)); + } + + EXPECT_EQ(0, storage->close()); +} + +// Test: Multiple threads writing to the same segment at different offsets +TEST_F(BufferStorageWriteTest, ConcurrentWriteSameSegment) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + // Need large enough segment for all threads + ASSERT_EQ(0, storage->append("shared_seg", 65536)); + auto seg = storage->get("shared_seg"); + ASSERT_TRUE(seg); + + const int kNumThreads = 8; + const size_t kChunkSize = 256; + std::atomic errors{0}; + std::vector threads; + + for (int i = 0; i < kNumThreads; ++i) { + threads.emplace_back([&, i]() { + // Each thread writes to its own non-overlapping region + size_t offset = i * kChunkSize * 10; + std::vector data(kChunkSize, static_cast('A' + i)); + for (int j = 0; j < 10; ++j) { + if (seg->write(offset + j * kChunkSize, data.data(), data.size()) != + data.size()) { + errors.fetch_add(1); + } + } + }); + } + + for (auto &t : threads) t.join(); + EXPECT_EQ(0, errors.load()); + + // Verify each thread's region + for (int i = 0; i < kNumThreads; ++i) { + size_t offset = i * kChunkSize * 10; + std::vector buf(kChunkSize); + seg->fetch(offset, buf.data(), kChunkSize); + for (size_t b = 0; b < kChunkSize; ++b) { + EXPECT_EQ(buf[b], static_cast('A' + i)) + << "Mismatch at thread " << i << " byte " << b; + } + } + + EXPECT_EQ(0, storage->close()); +} + +// Test: Concurrent writers + flush (simulates real workload) +TEST_F(BufferStorageWriteTest, ConcurrentWriteWithFlush) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + ASSERT_EQ(0, storage->append("seg1", 65536)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + + std::atomic stop{false}; + std::atomic write_errors{0}; + + // Writer threads + std::vector writers; + for (int i = 0; i < 4; ++i) { + writers.emplace_back([&, i]() { + std::vector data(128, 
static_cast('0' + i)); + int iter = 0; + while (!stop.load(std::memory_order_relaxed) && iter < 100) { + size_t offset = (i * 128 + (iter % 10) * 128) % 4096; + if (seg->write(offset, data.data(), data.size()) != data.size()) { + write_errors.fetch_add(1); + } + ++iter; + } + }); + } + + // Flush thread + std::thread flusher([&]() { + for (int i = 0; i < 5; ++i) { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + storage->flush(); + } + stop.store(true); + }); + + for (auto &w : writers) w.join(); + flusher.join(); + + EXPECT_EQ(0, write_errors.load()); + EXPECT_EQ(0, storage->close()); +} + +// ===== Append + Write Integration ===== + +// Test: Append multiple segments then write to each +TEST_F(BufferStorageWriteTest, AppendThenWriteSequence) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + for (int i = 0; i < 5; ++i) { + std::string seg_name = "seg_" + std::to_string(i); + ASSERT_EQ(0, storage->append(seg_name, 4096)); + auto seg = storage->get(seg_name); + ASSERT_TRUE(seg); + + std::string data = "content_of_segment_" + std::to_string(i); + EXPECT_EQ(data.size(), seg->write(0, data.data(), data.size())); + } + + // Verify all segments have correct data + for (int i = 0; i < 5; ++i) { + std::string seg_name = "seg_" + std::to_string(i); + auto seg = storage->get(seg_name); + ASSERT_TRUE(seg); + std::string expected = "content_of_segment_" + std::to_string(i); + std::vector buf(expected.size()); + EXPECT_EQ(expected.size(), seg->fetch(0, buf.data(), buf.size())); + EXPECT_EQ(expected, std::string(buf.data(), buf.size())); + } + + EXPECT_EQ(0, storage->close()); +} + +// Test: Write to a segment, append another, write to both, verify all +TEST_F(BufferStorageWriteTest, InterleavedAppendAndWrite) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + // Append and write first segment + ASSERT_EQ(0, storage->append("seg1", 4096)); + auto seg1 = storage->get("seg1"); + ASSERT_TRUE(seg1); + std::string d1 = "first_data"; + 
EXPECT_EQ(d1.size(), seg1->write(0, d1.data(), d1.size())); + + // Append second segment (triggers flush_index internally) + ASSERT_EQ(0, storage->append("seg2", 4096)); + auto seg2 = storage->get("seg2"); + ASSERT_TRUE(seg2); + std::string d2 = "second_data"; + EXPECT_EQ(d2.size(), seg2->write(0, d2.data(), d2.size())); + + // Re-get seg1 (pointer stability) and write more + auto seg1_again = storage->get("seg1"); + ASSERT_TRUE(seg1_again); + std::string d1_extra = "extra"; + EXPECT_EQ(d1_extra.size(), + seg1_again->write(d1.size(), d1_extra.data(), d1_extra.size())); + + // Verify all data + std::vector buf(d1.size() + d1_extra.size()); + EXPECT_EQ(buf.size(), seg1_again->fetch(0, buf.data(), buf.size())); + EXPECT_EQ(d1 + d1_extra, std::string(buf.data(), buf.size())); + + std::vector buf2(d2.size()); + EXPECT_EQ(d2.size(), seg2->fetch(0, buf2.data(), buf2.size())); + EXPECT_EQ(d2, std::string(buf2.data(), buf2.size())); + + EXPECT_EQ(0, storage->close()); +} + +// ===== Large Write Tests ===== + +// Test: Fill entire segment capacity with data +TEST_F(BufferStorageWriteTest, WriteLargeBuffer) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + // Request 16KB segment (will be page-aligned) + ASSERT_EQ(0, storage->append("big_seg", 16384)); + auto seg = storage->get("big_seg"); + ASSERT_TRUE(seg); + + size_t cap = seg->capacity(); + ASSERT_GE(cap, 16384u); + + // Fill with a pattern + std::vector data(cap); + std::iota(data.begin(), data.end(), static_cast(0)); + EXPECT_EQ(cap, seg->write(0, data.data(), data.size())); + EXPECT_EQ(cap, seg->data_size()); + EXPECT_EQ(0u, seg->padding_size()); + + // Verify a portion + std::vector verify(1024); + EXPECT_EQ(1024u, seg->fetch(0, verify.data(), 1024)); + EXPECT_EQ(0, std::memcmp(data.data(), verify.data(), 1024)); + + EXPECT_EQ(0, storage->close()); +} + +// Test: Large write persistence across close/reopen +TEST_F(BufferStorageWriteTest, WriteLargeBufferPersistence) { + const size_t kSize = 8192; + 
std::vector data(kSize); + for (size_t i = 0; i < kSize; ++i) { + data[i] = static_cast(i % 256); + } + + { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + ASSERT_EQ(0, storage->append("large_seg", kSize)); + auto seg = storage->get("large_seg"); + ASSERT_TRUE(seg); + EXPECT_EQ(kSize, seg->write(0, data.data(), data.size())); + EXPECT_EQ(0, storage->close()); + } + + { + auto storage = OpenReadOnly(); + ASSERT_TRUE(storage); + auto seg = storage->get("large_seg"); + ASSERT_TRUE(seg); + EXPECT_EQ(kSize, seg->data_size()); + + std::vector buf(kSize); + EXPECT_EQ(kSize, seg->fetch(0, buf.data(), kSize)); + EXPECT_EQ(0, std::memcmp(data.data(), buf.data(), kSize)); + EXPECT_EQ(0, storage->close()); + } +} + +// ===== Refresh / Checkpoint Tests ===== + +// Test: refresh() updates checkpoint and marks dirty +TEST_F(BufferStorageWriteTest, RefreshCheckpoint) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + ASSERT_EQ(0, storage->append("seg1", 4096)); + + storage->refresh(42); + EXPECT_EQ(0, storage->flush()); + + // After flush the check_point should be >= 42 + EXPECT_GE(storage->check_point(), 42u); + + // Increasing checkpoint + storage->refresh(100); + EXPECT_EQ(0, storage->flush()); + EXPECT_GE(storage->check_point(), 100u); + + EXPECT_EQ(0, storage->close()); +} + +// ===== Duplicate / Error Handling ===== + +// Test: Appending a duplicate segment ID returns error +TEST_F(BufferStorageWriteTest, AppendDuplicateSegment) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + ASSERT_EQ(0, storage->append("dup_seg", 4096)); + // Second append with same ID should fail + EXPECT_NE(0, storage->append("dup_seg", 4096)); + + EXPECT_EQ(0, storage->close()); +} + +// Test: Appending a zero-size segment returns error +TEST_F(BufferStorageWriteTest, AppendZeroSize) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + EXPECT_NE(0, storage->append("zero_seg", 0)); + + EXPECT_EQ(0, storage->close()); +} + +// ===== Code Review Issue Tests 
=====
+// The following tests target specific bugs/races found during code review.
+
+// PR#414 Issue: data_size concurrent race on same segment.
+// Multiple threads calling write() with different offsets should not corrupt
+// the (data_size, padding_size) pair. Their sum must equal capacity when
+// observed after all writers quiesce (individual unsynchronized reads during
+// concurrent writes may appear torn, which is expected).
+TEST_F(BufferStorageWriteTest, CR_DataSizePaddingSizeInvariant) {
+  constexpr size_t kChunkBytes = 64;  // bytes written by every write() call
+  constexpr int kNumThreads = 8;
+  constexpr int kIters = 200;  // write() calls issued by each thread
+
+  auto storage = OpenWritable();
+  ASSERT_TRUE(storage);
+  ASSERT_EQ(0, storage->append("seg1", 8192));
+  auto seg = storage->get("seg1");
+  ASSERT_TRUE(seg);
+
+  const size_t cap = seg->capacity();
+  // Offsets are reduced modulo (cap - kChunkBytes). Require cap > kChunkBytes
+  // so that the subtraction cannot wrap and the divisor is never zero.
+  ASSERT_GT(cap, kChunkBytes);
+  const size_t offset_range = cap - kChunkBytes;
+
+  std::atomic<int> write_failures{0};
+  std::vector<std::thread> writers;
+  writers.reserve(kNumThreads);
+
+  for (int i = 0; i < kNumThreads; ++i) {
+    writers.emplace_back([&, i]() {
+      // Every thread fills its chunk with its own letter: 'A', 'B', ...
+      char chunk[kChunkBytes];
+      std::memset(chunk, 'A' + i, sizeof(chunk));
+      for (int j = 0; j < kIters; ++j) {
+        // Scatter the writes across the segment so that data_size grows.
+        // Every offset leaves room for a whole chunk below `cap`.
+        const size_t offset = (static_cast<size_t>(i) * kChunkBytes +
+                               static_cast<size_t>(j) * 7) %
+                              offset_range;
+        if (seg->write(offset, chunk, sizeof(chunk)) != sizeof(chunk)) {
+          write_failures.fetch_add(1);
+        }
+      }
+    });
+  }
+  for (auto &t : writers) t.join();
+
+  EXPECT_EQ(0, write_failures.load());
+  // The pair is only inspected here, after every writer has been joined.
+  EXPECT_EQ(cap, seg->data_size() + seg->padding_size());
+  EXPECT_GT(seg->data_size(), 0u);
+  EXPECT_EQ(0, storage->close());
+}
+
+// PR#414 Issue: Concurrent write() + resize() on same segment.
+// meta_mtx_ must serialize so that (data_size, padding_size) stays consistent.
+// The invariant is verified after all threads stop (reads without meta_mtx_
+// during concurrent mutation may observe a torn pair, which is expected).
+TEST_F(BufferStorageWriteTest, CR_ConcurrentWriteAndResize) {
+  constexpr size_t kChunkBytes = 128;  // bytes written by every write() call
+  constexpr int kIters = 300;          // iterations of each worker thread
+
+  auto storage = OpenWritable();
+  ASSERT_TRUE(storage);
+  ASSERT_EQ(0, storage->append("seg1", 8192));
+  auto seg = storage->get("seg1");
+  ASSERT_TRUE(seg);
+
+  const size_t cap = seg->capacity();
+  // The writer reduces its offsets modulo (cap - kChunkBytes); require
+  // cap > kChunkBytes so the subtraction cannot wrap or yield zero.
+  ASSERT_GT(cap, kChunkBytes);
+
+  std::atomic<bool> stop{false};
+  std::atomic<int> write_failures{0};
+
+  // Writer: stores one chunk per iteration until it runs out of iterations
+  // or the resizer raises `stop`. Short writes are counted as failures.
+  std::thread writer([&]() {
+    char chunk[kChunkBytes];
+    std::memset(chunk, 'W', sizeof(chunk));
+    const size_t offset_range = cap - kChunkBytes;
+    for (int j = 0; j < kIters; ++j) {
+      if (stop.load(std::memory_order_relaxed)) break;
+      const size_t offset = static_cast<size_t>(j) % offset_range;
+      if (seg->write(offset, chunk, sizeof(chunk)) != sizeof(chunk)) {
+        write_failures.fetch_add(1);
+      }
+    }
+  });
+
+  // Resizer: requests a different size below `cap` on every iteration, then
+  // raises `stop` so the writer ends as well. The result of resize() is not
+  // inspected; this test only asserts the invariant after both joins.
+  std::thread resizer([&]() {
+    for (int j = 0; j < kIters; ++j) {
+      if (stop.load(std::memory_order_relaxed)) break;
+      const size_t new_size = (static_cast<size_t>(j) * 37) % cap;
+      seg->resize(new_size);
+    }
+    stop.store(true);
+  });
+
+  writer.join();
+  resizer.join();
+
+  EXPECT_EQ(0, write_failures.load());
+  // Both threads are joined, so the pair can be read without tearing.
+  EXPECT_EQ(cap, seg->data_size() + seg->padding_size());
+  EXPECT_EQ(0, storage->close());
+}
+
+// Chain-split bug: Many appends exhaust segment_meta capacity, triggering
+// chain split. After reopen, ALL segments must be findable.
+// (Tests fix for reserve()-induced dangling pointer in append_segment.)
+TEST_F(BufferStorageWriteTest, CR_ChainSplitAllSegmentsAccessible) {
+  // Enough to trigger chain split with default 4096 meta capacity
+  constexpr int kNumSegments = 50;
+
+  // Both phases derive the segment id and its payload from the index through
+  // these helpers, so the writing and the verifying loop cannot drift apart.
+  const auto segment_name = [](int i) {
+    return "chain_seg_" + std::to_string(i);
+  };
+  const auto segment_marker = [](int i) {
+    return "marker_" + std::to_string(i);
+  };
+
+  // Phase 1: append the segments and store one marker in each of them.
+  {
+    auto storage = OpenWritable();
+    ASSERT_TRUE(storage);
+
+    for (int i = 0; i < kNumSegments; ++i) {
+      const std::string name = segment_name(i);
+      ASSERT_EQ(0, storage->append(name, 4096))
+          << "Failed to append segment " << i;
+      auto seg = storage->get(name);
+      ASSERT_TRUE(seg) << "Failed to get segment " << name
+                       << " right after append";
+      // The marker is what phase 2 reads back after the reopen.
+      const std::string marker = segment_marker(i);
+      EXPECT_EQ(marker.size(), seg->write(0, marker.data(), marker.size()));
+    }
+    EXPECT_EQ(0, storage->flush());
+    EXPECT_EQ(0, storage->close());
+  }
+
+  // Phase 2: reopen read-only; every segment must be found and must return
+  // exactly the marker stored in phase 1.
+  {
+    auto storage = OpenReadOnly();
+    ASSERT_TRUE(storage);
+
+    for (int i = 0; i < kNumSegments; ++i) {
+      const std::string name = segment_name(i);
+      auto seg = storage->get(name);
+      ASSERT_TRUE(seg) << "Segment " << name
+                       << " missing after reopen (chain-split bug?)";
+      const std::string expected = segment_marker(i);
+      std::vector<char> buf(expected.size());
+      EXPECT_EQ(expected.size(), seg->fetch(0, buf.data(), buf.size()));
+      EXPECT_EQ(expected, std::string(buf.data(), buf.size()))
+          << "Data mismatch for " << name;
+    }
+    EXPECT_EQ(0, storage->close());
+  }
+}
+
+// mapping_shard_id bug: Multiple BufferStorage instances opened on the
+// same thread must work correctly (the old thread_local shard_id would
+// map them to the same shard, causing potential conflicts).
+TEST_F(BufferStorageWriteTest, CR_MultipleInstancesSameThread) {
+  // Deletes the second storage file when the test ends, including the early
+  // exits taken by a failing ASSERT_*. The instance is declared before the
+  // storages, so it is destroyed after them.
+  struct ScopedFileRemover {
+    std::string path;
+    ~ScopedFileRemover() { ailego::File::Delete(path); }
+  };
+
+  const std::string path2 = file_path_ + "_second";
+  ailego::File::Delete(path2);  // drop leftovers of an earlier run
+  const ScopedFileRemover remove_path2{path2};
+
+  auto storage1 = OpenWritable();
+  ASSERT_TRUE(storage1);
+
+  // Second, independent BufferStorage instance created on this same thread.
+  auto storage2 = IndexFactory::CreateStorage("BufferStorage");
+  ASSERT_TRUE(storage2);
+  ailego::Params params;
+  // NOTE(review): assumes init() follows the 0-on-success convention that
+  // open()/append()/close() use in this test - confirm against IndexStorage.
+  ASSERT_EQ(0, storage2->init(params));
+  ASSERT_EQ(0, storage2->open(path2, true));
+
+  // Interleave the operations on both instances from the SAME thread.
+  ASSERT_EQ(0, storage1->append("seg_a", 4096));
+  ASSERT_EQ(0, storage2->append("seg_b", 4096));
+
+  auto seg_a = storage1->get("seg_a");
+  auto seg_b = storage2->get("seg_b");
+  ASSERT_TRUE(seg_a);
+  ASSERT_TRUE(seg_b);
+
+  const std::string da = "instance_one_data";
+  const std::string db = "instance_two_data";
+  EXPECT_EQ(da.size(), seg_a->write(0, da.data(), da.size()));
+  EXPECT_EQ(db.size(), seg_b->write(0, db.data(), db.size()));
+
+  // Data isolation: each segment must return only what was written to it.
+  std::vector<char> buf1(da.size());
+  EXPECT_EQ(da.size(), seg_a->fetch(0, buf1.data(), buf1.size()));
+  EXPECT_EQ(da, std::string(buf1.data(), buf1.size()));
+
+  std::vector<char> buf2(db.size());
+  EXPECT_EQ(db.size(), seg_b->fetch(0, buf2.data(), buf2.size()));
+  EXPECT_EQ(db, std::string(buf2.data(), buf2.size()));
+
+  EXPECT_EQ(0, storage1->close());
+  EXPECT_EQ(0, storage2->close());
+  // path2 is deleted by remove_path2 once the locals above are gone.
+}
+
+// Cross-page read/write: Write data spanning page boundaries (4KB pages),
+// then read back via both fetch() and read(MemoryBlock&) to verify the
+// cross-page buffer allocation path. (Tests fix for UAF in cross-page read.)
+TEST_F(BufferStorageWriteTest, CR_CrossPageWriteAndRead) {
+  // Write 5000 bytes starting at offset 2000, which crosses the first
+  // page boundary at 4096 (relative to segment data start in the file).
+  constexpr size_t kWriteOffset = 2000;
+  constexpr size_t kWriteLen = 5000;
+
+  auto storage = OpenWritable();
+  ASSERT_TRUE(storage);
+
+  // Segment large enough to span multiple pages
+  ASSERT_EQ(0, storage->append("cross_page_seg", 16384));
+  auto seg = storage->get("cross_page_seg");
+  ASSERT_TRUE(seg);
+
+  // Position-dependent byte pattern, so misplaced data cannot compare equal
+  // by accident.
+  std::vector<char> write_data(kWriteLen);
+  for (size_t i = 0; i < kWriteLen; ++i) {
+    write_data[i] = static_cast<char>((i * 7 + 13) % 256);
+  }
+  EXPECT_EQ(kWriteLen, seg->write(kWriteOffset, write_data.data(), kWriteLen));
+
+  // Read back via fetch (uses read_range internally for cross-page)
+  std::vector<char> fetch_buf(kWriteLen);
+  EXPECT_EQ(kWriteLen, seg->fetch(kWriteOffset, fetch_buf.data(), kWriteLen));
+  EXPECT_EQ(write_data, fetch_buf);
+
+  // Read back via read(MemoryBlock&) - exercises the cross-page alloc path.
+  // Scope the MemoryBlock so it is destroyed BEFORE storage->close():
+  // when the read happens to land on a single page (e.g. macOS arm64 with
+  // 16KB pages, where [2000, 7000) fits in one page) the returned block
+  // is MBT_BUFFERPOOL holding a raw pointer to buffer_pool_handle_. Once
+  // close_index() resets buffer_pool_handle_/buffer_pool_, that raw
+  // pointer dangles and ~MemoryBlock()'s release_one() segfaults.
+  {
+    IndexStorage::MemoryBlock mb;
+    const auto read_len = seg->read(kWriteOffset, mb, kWriteLen);
+    EXPECT_EQ(kWriteLen, read_len);
+    EXPECT_NE(nullptr, mb.data());
+    // Compare the contents only after a complete read: following a failed
+    // or short read the block may not reference kWriteLen readable bytes,
+    // and memcmp() on it would be undefined behaviour instead of a clean
+    // test failure.
+    if (read_len == kWriteLen && mb.data() != nullptr) {
+      EXPECT_EQ(0, std::memcmp(write_data.data(), mb.data(), kWriteLen));
+    }
+  }
+
+  EXPECT_EQ(0, storage->close());
+}
+
+// Dirty flag race: write() after flush_index() must re-set the dirty flag.
+// If the write lands between CAS(dirty, false) and the end of flush,
+// the next flush must still persist it. Verified by close→reopen→read.
+TEST_F(BufferStorageWriteTest, CR_DirtyFlagNotLostAfterFlush) {
+  const std::string early_data = "early";
+  const std::string late_data = "late_write_after_flush";
+  // Offset of the post-flush write. Phase 2 reads back from this same
+  // offset, so it is named once instead of being repeated as a literal.
+  constexpr size_t kLateOffset = 100;
+
+  // Phase 1: write, flush, write again, then close without another flush.
+  {
+    auto storage = OpenWritable();
+    ASSERT_TRUE(storage);
+    ASSERT_EQ(0, storage->append("seg1", 4096));
+    auto seg = storage->get("seg1");
+    ASSERT_TRUE(seg);
+
+    // Write and flush
+    EXPECT_EQ(early_data.size(),
+              seg->write(0, early_data.data(), early_data.size()));
+    EXPECT_EQ(0, storage->flush());
+
+    // Write again AFTER flush - dirty flag must be re-set
+    EXPECT_EQ(late_data.size(),
+              seg->write(kLateOffset, late_data.data(), late_data.size()));
+    // Close without explicit flush (close_index will flush)
+    EXPECT_EQ(0, storage->close());
+  }
+
+  // Phase 2: reopen read-only and verify what reached the file.
+  {
+    auto storage = OpenReadOnly();
+    ASSERT_TRUE(storage);
+    auto seg = storage->get("seg1");
+    ASSERT_TRUE(seg);
+
+    // The late write is the one that a lost dirty flag would drop.
+    std::vector<char> late_buf(late_data.size());
+    EXPECT_EQ(late_data.size(),
+              seg->fetch(kLateOffset, late_buf.data(), late_buf.size()));
+    EXPECT_EQ(late_data, std::string(late_buf.data(), late_buf.size()));
+
+    // The bytes flushed before the late write must have survived as well.
+    std::vector<char> early_buf(early_data.size());
+    EXPECT_EQ(early_data.size(),
+              seg->fetch(0, early_buf.data(), early_buf.size()));
+    EXPECT_EQ(early_data, std::string(early_buf.data(), early_buf.size()));
+
+    EXPECT_EQ(0, storage->close());
+  }
+}
+
+// Stress test: Concurrent flush + write interleaving to expose dirty flag races.
+// All writes that return successfully MUST be visible after final close+reopen.
+TEST_F(BufferStorageWriteTest, CR_ConcurrentFlushWriteDirtyFlagStress) {
+  constexpr size_t kSegmentBytes = 65536;
+  constexpr size_t kChunkBytes = 64;   // bytes written by every write() call
+  constexpr size_t kMaxWrites = 500;   // upper bound on write() calls
+  constexpr int kFlushRounds = 50;     // flush() calls issued by the flusher
+  const std::string pattern(kChunkBytes, 'P');
+
+  // committed[i] is set once the write of chunk i has returned success. Only
+  // the writer thread stores into it, and it is read after that thread has
+  // been joined.
+  std::vector<char> committed(kMaxWrites, 0);
+  size_t committed_end = 0;  // highest offset + length successfully written
+
+  // Phase 1: one thread writes chunks at increasing offsets while another
+  // one keeps flushing.
+  {
+    auto storage = OpenWritable();
+    ASSERT_TRUE(storage);
+    ASSERT_EQ(0, storage->append("seg1", kSegmentBytes));
+    auto seg = storage->get("seg1");
+    ASSERT_TRUE(seg);
+
+    std::atomic<size_t> max_committed_end{0};
+    std::atomic<bool> stop{false};
+    std::atomic<int> flush_failures{0};
+
+    // Writer: chunk i goes to offset i * kChunkBytes.
+    std::thread writer([&]() {
+      for (size_t i = 0; i < kMaxWrites; ++i) {
+        if (stop.load(std::memory_order_relaxed)) break;
+        const size_t offset = i * kChunkBytes;
+        if (offset + kChunkBytes > kSegmentBytes) break;
+        if (seg->write(offset, pattern.data(), kChunkBytes) != kChunkBytes) {
+          continue;  // a rejected write is simply not recorded as committed
+        }
+        committed[i] = 1;
+        // Raise max_committed_end to `end` unless it is already higher.
+        const size_t end = offset + kChunkBytes;
+        size_t cur = max_committed_end.load(std::memory_order_relaxed);
+        while (end > cur &&
+               !max_committed_end.compare_exchange_weak(
+                   cur, end, std::memory_order_relaxed)) {
+        }
+      }
+    });
+
+    // Flusher: repeatedly flushes to trigger the CAS(dirty, false) path. A
+    // failing flush is counted, because it would leave that path unexercised.
+    std::thread flusher([&]() {
+      for (int round = 0; round < kFlushRounds; ++round) {
+        if (storage->flush() != 0) flush_failures.fetch_add(1);
+        std::this_thread::sleep_for(std::chrono::microseconds(100));
+      }
+      stop.store(true);
+    });
+
+    writer.join();
+    flusher.join();
+
+    committed_end = max_committed_end.load();
+    EXPECT_EQ(0, flush_failures.load());
+    EXPECT_GE(seg->data_size(), committed_end);
+    EXPECT_EQ(0, storage->close());
+  }
+
+  // Phase 2: reopen read-only; every chunk whose write() reported success
+  // must be readable and must still hold the pattern.
+  {
+    auto storage = OpenReadOnly();
+    ASSERT_TRUE(storage);
+    auto seg = storage->get("seg1");
+    ASSERT_TRUE(seg);
+    EXPECT_GE(seg->data_size(), committed_end);
+
+    std::vector<char> buf(kChunkBytes);
+    for (size_t i = 0; i < kMaxWrites; ++i) {
+      if (!committed[i]) continue;
+      const size_t offset = i * kChunkBytes;
+      ASSERT_EQ(kChunkBytes, seg->fetch(offset, buf.data(), buf.size()))
+          << "Committed chunk at offset " << offset << " is not readable";
+      EXPECT_EQ(pattern, std::string(buf.data(), buf.size()))
+          << "Committed chunk at offset " << offset << " lost its data";
+    }
+    EXPECT_EQ(0, storage->close());
+  }
+}
+
+// Pointer stability after append: WrappedSegment obtained BEFORE a new
+// append must still work correctly AFTER the append (unordered_map address
+// stability guarantee). This tests the fix for reserve()-based invalidation.
+TEST_F(BufferStorageWriteTest, CR_PointerStabilityAcrossAppend) {
+  constexpr int kExtraSegments = 20;  // appended while seg_first is held
+  const std::string initial = "before_append";
+  const std::string after = "_after_appends";
+
+  auto storage = OpenWritable();
+  ASSERT_TRUE(storage);
+
+  // Take the handle under test BEFORE any further append happens.
+  ASSERT_EQ(0, storage->append("seg_first", 4096));
+  auto seg_first = storage->get("seg_first");
+  ASSERT_TRUE(seg_first);
+
+  // Write initial data
+  EXPECT_EQ(initial.size(),
+            seg_first->write(0, initial.data(), initial.size()));
+
+  // Append many more segments (may trigger internal rehash/resize)
+  for (int i = 0; i < kExtraSegments; ++i) {
+    const std::string name = "new_seg_" + std::to_string(i);
+    ASSERT_EQ(0, storage->append(name, 4096))
+        << "Failed to append segment " << name;
+  }
+
+  // The original segment handle must still be valid and writable: continue
+  // right behind the bytes written before the appends.
+  EXPECT_EQ(after.size(),
+            seg_first->write(initial.size(), after.data(), after.size()));
+
+  // Both writes must read back as one contiguous string.
+  const std::string expected = initial + after;
+  std::vector<char> buf(expected.size());
+  EXPECT_EQ(expected.size(), seg_first->fetch(0, buf.data(), buf.size()));
+  EXPECT_EQ(expected, std::string(buf.data(), buf.size()));
+
+  EXPECT_EQ(0, storage->close());
+}
+
+// update_data_crc concurrent with write: CRC update must be serialized
+// with data_size changes via meta_mtx_. Invariant verified post-quiescence.
+TEST_F(BufferStorageWriteTest, CR_ConcurrentWriteAndCrcUpdate) { + auto storage = OpenWritable(); + ASSERT_TRUE(storage); + + ASSERT_EQ(0, storage->append("seg1", 8192)); + auto seg = storage->get("seg1"); + ASSERT_TRUE(seg); + const size_t cap = seg->capacity(); + + std::atomic stop{false}; + std::atomic write_failures{0}; + + // Writer thread + std::thread writer([&]() { + char buf[128]; + std::memset(buf, 'X', sizeof(buf)); + for (int i = 0; i < 500 && !stop.load(std::memory_order_relaxed); ++i) { + size_t offset = (i * 128) % (cap - 128); + if (seg->write(offset, buf, sizeof(buf)) != sizeof(buf)) { + write_failures.fetch_add(1); + } + } + }); + + // CRC updater thread: concurrently updates CRC + std::thread crc_updater([&]() { + for (int i = 0; i < 500 && !stop.load(std::memory_order_relaxed); ++i) { + seg->update_data_crc(static_cast(i)); + } + stop.store(true); + }); + + writer.join(); + crc_updater.join(); + + EXPECT_EQ(0, write_failures.load()); + // After all threads stop, invariant must hold + EXPECT_EQ(cap, seg->data_size() + seg->padding_size()); + // CRC should have been updated (last writer wins) + // Just verify it doesn't crash and the value is readable + (void)seg->data_crc(); + EXPECT_EQ(0, storage->close()); +} diff --git a/tests/db/collection_test.cc b/tests/db/collection_test.cc index 2fcf3de18..1ffdca863 100644 --- a/tests/db/collection_test.cc +++ b/tests/db/collection_test.cc @@ -47,6 +47,8 @@ std::string col_path = "test_collection"; class CollectionTest : public ::testing::Test { protected: void SetUp() override { + zvec::ailego::MemoryLimitPool::get_instance().init(2 * 1024ll * 1024ll * + 1024ll); FileHelper::RemoveDirectory(col_path); } @@ -57,128 +59,132 @@ class CollectionTest : public ::testing::Test { }; TEST_F(CollectionTest, Feature_CreateAndOpen_General) { - CollectionOptions options; - options.read_only_ = false; - options.enable_mmap_ = true; + auto func = [&](bool enable_mmap) { + CollectionOptions options; + 
options.read_only_ = false; + options.enable_mmap_ = enable_mmap; - std::string path = "./demo"; + std::string path = "./demo"; - ailego::FileHelper::RemoveDirectory(path.c_str()); + ailego::FileHelper::RemoveDirectory(path.c_str()); - auto schema = TestHelper::CreateNormalSchema(); - auto result = Collection::CreateAndOpen(path, *schema, options); - if (!result.has_value()) { - std::cout << result.error().message() << std::endl; - } - ASSERT_TRUE(result.has_value()); - ASSERT_TRUE(ailego::FileHelper::IsExist(path.c_str())); - - auto col = result.value(); - ASSERT_EQ(col->Path(), path); - ASSERT_EQ(col->Schema(), *schema); - ASSERT_EQ(col->Options(), options); - auto stats = col->Stats().value(); - ASSERT_TRUE(stats.doc_count == 0); - ASSERT_EQ(stats.index_completeness["dense_fp32"], 1); - ASSERT_EQ(stats.index_completeness["dense_fp16"], 1); - // ASSERT_EQ(stats.index_completeness["dense_fp64"], 1); - ASSERT_EQ(stats.index_completeness["sparse_fp32"], 1); - ASSERT_EQ(stats.index_completeness["sparse_fp16"], 1); - - ASSERT_EQ(col->Destroy(), Status::OK()); - - // after destroyed, every interface should return error - std::vector empty_docs; - ASSERT_FALSE(col->Insert(empty_docs).has_value()); - ASSERT_FALSE(col->Update(empty_docs).has_value()); - ASSERT_FALSE(col->Delete({}).has_value()); - ASSERT_FALSE(col->DeleteByFilter("").ok()); - ASSERT_FALSE(col->Fetch({}).has_value()); - ASSERT_FALSE(col->Query(SearchQuery{}).has_value()); - ASSERT_FALSE(col->Query(MultiQuery{}).has_value()); - ASSERT_FALSE(col->GroupByQuery({}).has_value()); - ASSERT_FALSE(col->CreateIndex("", nullptr).ok()); - ASSERT_FALSE(col->DropIndex("").ok()); - ASSERT_FALSE(col->AddColumn(nullptr, "").ok()); - ASSERT_FALSE(col->AlterColumn("", "", nullptr).ok()); - ASSERT_FALSE(col->DropColumn("").ok()); - ASSERT_FALSE(col->CreateIndex("", nullptr).ok()); - ASSERT_FALSE(col->Optimize().ok()); - ASSERT_FALSE(col->Flush().ok()); - ASSERT_FALSE(col->Destroy().ok()); - 
ASSERT_FALSE(col->Options().has_value()); - ASSERT_FALSE(col->Path().has_value()); - ASSERT_FALSE(col->Stats().has_value()); - ASSERT_FALSE(col->Schema().has_value()); - - ASSERT_FALSE(ailego::FileHelper::IsExist(path.c_str())); - - // recreate - result = Collection::CreateAndOpen(path, *schema, options); - ASSERT_TRUE(result.has_value()); - ASSERT_TRUE(ailego::FileHelper::IsExist(path.c_str())); + auto schema = TestHelper::CreateNormalSchema(); + auto result = Collection::CreateAndOpen(path, *schema, options); + if (!result.has_value()) { + std::cout << result.error().message() << std::endl; + } + ASSERT_TRUE(result.has_value()); + ASSERT_TRUE(ailego::FileHelper::IsExist(path.c_str())); + + auto col = result.value(); + ASSERT_EQ(col->Path(), path); + ASSERT_EQ(col->Schema(), *schema); + ASSERT_EQ(col->Options(), options); + auto stats = col->Stats().value(); + ASSERT_TRUE(stats.doc_count == 0); + ASSERT_EQ(stats.index_completeness["dense_fp32"], 1); + ASSERT_EQ(stats.index_completeness["dense_fp16"], 1); + // ASSERT_EQ(stats.index_completeness["dense_fp64"], 1); + ASSERT_EQ(stats.index_completeness["sparse_fp32"], 1); + ASSERT_EQ(stats.index_completeness["sparse_fp16"], 1); - col = std::move(result.value()); - col.reset(); - col = nullptr; + ASSERT_EQ(col->Destroy(), Status::OK()); + + // after destroyed, every interface should return error + std::vector empty_docs; + ASSERT_FALSE(col->Insert(empty_docs).has_value()); + ASSERT_FALSE(col->Update(empty_docs).has_value()); + ASSERT_FALSE(col->Delete({}).has_value()); + ASSERT_FALSE(col->DeleteByFilter("").ok()); + ASSERT_FALSE(col->Fetch({}).has_value()); + ASSERT_FALSE(col->Query(SearchQuery{}).has_value()); + ASSERT_FALSE(col->Query(MultiQuery{}).has_value()); + ASSERT_FALSE(col->GroupByQuery({}).has_value()); + ASSERT_FALSE(col->CreateIndex("", nullptr).ok()); + ASSERT_FALSE(col->DropIndex("").ok()); + ASSERT_FALSE(col->AddColumn(nullptr, "").ok()); + ASSERT_FALSE(col->AlterColumn("", "", nullptr).ok()); + 
ASSERT_FALSE(col->DropColumn("").ok()); + ASSERT_FALSE(col->CreateIndex("", nullptr).ok()); + ASSERT_FALSE(col->Optimize().ok()); + ASSERT_FALSE(col->Flush().ok()); + ASSERT_FALSE(col->Destroy().ok()); + ASSERT_FALSE(col->Options().has_value()); + ASSERT_FALSE(col->Path().has_value()); + ASSERT_FALSE(col->Stats().has_value()); + ASSERT_FALSE(col->Schema().has_value()); + + ASSERT_FALSE(ailego::FileHelper::IsExist(path.c_str())); + + // recreate + result = Collection::CreateAndOpen(path, *schema, options); + ASSERT_TRUE(result.has_value()); + ASSERT_TRUE(ailego::FileHelper::IsExist(path.c_str())); - ASSERT_TRUE(ailego::FileHelper::IsExist(path.c_str())); + col = std::move(result.value()); + col.reset(); + col = nullptr; - // reopen - result = Collection::Open(path, options); - ASSERT_TRUE(result.has_value()); - col = std::move(result.value()); - col.reset(); + ASSERT_TRUE(ailego::FileHelper::IsExist(path.c_str())); - // reopen with read-only - options.read_only_ = true; - result = Collection::Open(path, options); - if (!result.has_value()) { - std::cout << result.error().message() << std::endl; - } - ASSERT_TRUE(result.has_value()); - col = result.value(); + // reopen + result = Collection::Open(path, options); + ASSERT_TRUE(result.has_value()); + col = std::move(result.value()); + col.reset(); - ASSERT_EQ(col->Path(), path); - ASSERT_EQ(col->Schema(), *schema); - ASSERT_EQ(col->Options(), options); - stats = col->Stats().value(); - ASSERT_TRUE(stats.doc_count == 0); - ASSERT_EQ(stats.index_completeness["dense_fp32"], 1); - ASSERT_EQ(stats.index_completeness["dense_fp16"], 1); - // ASSERT_EQ(stats.index_completeness["dense_fp64"], 1); - ASSERT_EQ(stats.index_completeness["sparse_fp32"], 1); - ASSERT_EQ(stats.index_completeness["sparse_fp16"], 1); - - // when open with read-only, write operation should fail - ASSERT_FALSE(col->Flush().ok()); - ASSERT_FALSE(col->Destroy().ok()); - ASSERT_FALSE(col->Insert(empty_docs).has_value()); - 
ASSERT_FALSE(col->Update(empty_docs).has_value()); - ASSERT_FALSE(col->Delete({}).has_value()); - ASSERT_FALSE(col->DeleteByFilter("").ok()); - ASSERT_FALSE(col->CreateIndex("", nullptr).ok()); - ASSERT_FALSE(col->DropIndex("").ok()); - ASSERT_FALSE(col->AddColumn(nullptr, "").ok()); - ASSERT_FALSE(col->AlterColumn("", "", nullptr).ok()); - ASSERT_FALSE(col->DropColumn("").ok()); - ASSERT_FALSE(col->CreateIndex("", nullptr).ok()); - ASSERT_FALSE(col->Optimize().ok()); - - // two threads open with read_only - result = Collection::Open(path, options); - if (!result.has_value()) { - std::cout << result.error().message() << std::endl; - } - ASSERT_TRUE(result.has_value()); - col = result.value(); + // reopen with read-only + options.read_only_ = true; + result = Collection::Open(path, options); + if (!result.has_value()) { + std::cout << result.error().message() << std::endl; + } + ASSERT_TRUE(result.has_value()); + col = result.value(); - auto result1 = Collection::Open(path, options); - if (!result1.has_value()) { - std::cout << result1.error().message() << std::endl; - } - ASSERT_TRUE(result1.has_value()); - auto col1 = result1.value(); + ASSERT_EQ(col->Path(), path); + ASSERT_EQ(col->Schema(), *schema); + ASSERT_EQ(col->Options(), options); + stats = col->Stats().value(); + ASSERT_TRUE(stats.doc_count == 0); + ASSERT_EQ(stats.index_completeness["dense_fp32"], 1); + ASSERT_EQ(stats.index_completeness["dense_fp16"], 1); + // ASSERT_EQ(stats.index_completeness["dense_fp64"], 1); + ASSERT_EQ(stats.index_completeness["sparse_fp32"], 1); + ASSERT_EQ(stats.index_completeness["sparse_fp16"], 1); + + // when open with read-only, write operation should fail + ASSERT_FALSE(col->Flush().ok()); + ASSERT_FALSE(col->Destroy().ok()); + ASSERT_FALSE(col->Insert(empty_docs).has_value()); + ASSERT_FALSE(col->Update(empty_docs).has_value()); + ASSERT_FALSE(col->Delete({}).has_value()); + ASSERT_FALSE(col->DeleteByFilter("").ok()); + ASSERT_FALSE(col->CreateIndex("", nullptr).ok()); + 
ASSERT_FALSE(col->DropIndex("").ok()); + ASSERT_FALSE(col->AddColumn(nullptr, "").ok()); + ASSERT_FALSE(col->AlterColumn("", "", nullptr).ok()); + ASSERT_FALSE(col->DropColumn("").ok()); + ASSERT_FALSE(col->CreateIndex("", nullptr).ok()); + ASSERT_FALSE(col->Optimize().ok()); + + // two threads open with read_only + result = Collection::Open(path, options); + if (!result.has_value()) { + std::cout << result.error().message() << std::endl; + } + ASSERT_TRUE(result.has_value()); + col = result.value(); + + auto result1 = Collection::Open(path, options); + if (!result1.has_value()) { + std::cout << result1.error().message() << std::endl; + } + ASSERT_TRUE(result1.has_value()); + auto col1 = result1.value(); + }; + func(true); + func(false); } TEST_F(CollectionTest, Feature_CreateAndOpen_Empty) { @@ -391,13 +397,13 @@ TEST_F(CollectionTest, Feature_Write_Batch_Validate) { } TEST_F(CollectionTest, Feature_Insert_General) { - auto func = [&](bool schema_nullable, bool doc_nullable, + auto func = [&](bool enable_mmap, bool schema_nullable, bool doc_nullable, int doc_count = 1000) { FileHelper::RemoveDirectory(col_path); // create with normal schema auto schema = TestHelper::CreateNormalSchema(schema_nullable); - auto options = CollectionOptions{false, true, 100 * 1024 * 1024}; + auto options = CollectionOptions{false, enable_mmap, 100 * 1024 * 1024}; auto collection = TestHelper::CreateCollectionWithDoc( col_path, *schema, options, 0, doc_count, doc_nullable); @@ -478,14 +484,16 @@ TEST_F(CollectionTest, Feature_Insert_General) { ASSERT_EQ(stats.index_completeness["sparse_fp16"], 1); }; - func(false, false); - func(true, true); - func(true, false); - func(false, true); + for (bool enable_mmap : {true, false}) { + func(enable_mmap, false, false); + func(enable_mmap, true, true); + func(enable_mmap, true, false); + func(enable_mmap, false, true); - func(false, false, 0); - func(false, false, 1); - func(false, false, 2); + func(enable_mmap, false, false, 0); + 
func(enable_mmap, false, false, 1); + func(enable_mmap, false, false, 2); + } } TEST_F(CollectionTest, Feature_Insert_ScalarIndex) { @@ -809,13 +817,13 @@ TEST_F(CollectionTest, Feature_Insert_Duplicate) { } TEST_F(CollectionTest, Feature_Upsert_General) { - auto func = [&](bool schema_nullable, bool doc_nullable, + auto func = [&](bool enable_mmap, bool schema_nullable, bool doc_nullable, int doc_count = 1000) { FileHelper::RemoveDirectory(col_path); // create with normal schema auto schema = TestHelper::CreateNormalSchema(schema_nullable); - auto options = CollectionOptions{false, true, 100 * 1024 * 1024}; + auto options = CollectionOptions{false, enable_mmap, 100 * 1024 * 1024}; auto collection = TestHelper::CreateCollectionWithDoc( col_path, *schema, options, 0, doc_count, doc_nullable, true); @@ -896,14 +904,16 @@ TEST_F(CollectionTest, Feature_Upsert_General) { ASSERT_EQ(stats.index_completeness["sparse_fp16"], 1); }; - func(false, false); - func(true, true); - func(true, false); - func(false, true); + for (bool enable_mmap : {true, false}) { + func(enable_mmap, false, false); + func(enable_mmap, true, true); + func(enable_mmap, true, false); + func(enable_mmap, false, true); - func(false, false, 0); - func(false, false, 1); - func(false, false, 2); + func(enable_mmap, false, false, 0); + func(enable_mmap, false, false, 1); + func(enable_mmap, false, false, 2); + } } TEST_F(CollectionTest, Feature_Upsert_Incremental) { @@ -1096,9 +1106,9 @@ TEST_F(CollectionTest, Feature_Upsert_Nullable) { TEST_F(CollectionTest, Feature_Update_General) { - auto func = [&](int doc_count) { + auto func = [&](bool enable_mmap, int doc_count) { auto schema = TestHelper::CreateNormalSchema(); - auto options = CollectionOptions{false, true, 100 * 1024 * 1024}; + auto options = CollectionOptions{false, enable_mmap, 100 * 1024 * 1024}; FileHelper::RemoveDirectory(col_path); // insert first @@ -1180,10 +1190,12 @@ TEST_F(CollectionTest, Feature_Update_General) { check_doc(doc_count); 
}; - func(99); - func(100); - func(101); - func(1000); + for (bool enable_mmap : {true, false}) { + func(enable_mmap, 99); + func(enable_mmap, 100); + func(enable_mmap, 101); + func(enable_mmap, 1000); + } } TEST_F(CollectionTest, Feature_Update_Incremental) { @@ -1437,9 +1449,9 @@ TEST_F(CollectionTest, Feature_Update_Empty) { } TEST_F(CollectionTest, Feature_Delete_General) { - auto func = [&](int doc_count) { + auto func = [&](bool enable_mmap, int doc_count) { auto schema = TestHelper::CreateNormalSchema(); - auto options = CollectionOptions{false, true, 100 * 1024 * 1024}; + auto options = CollectionOptions{false, enable_mmap, 100 * 1024 * 1024}; FileHelper::RemoveDirectory(col_path); // insert first @@ -1515,10 +1527,12 @@ TEST_F(CollectionTest, Feature_Delete_General) { check_doc(doc_count); }; - func(99); - func(100); - func(101); - func(1000); + for (bool enable_mmap : {true, false}) { + func(enable_mmap, 99); + func(enable_mmap, 100); + func(enable_mmap, 101); + func(enable_mmap, 1000); + } } TEST_F(CollectionTest, Feature_Delete_Repeated) { @@ -1578,9 +1592,9 @@ TEST_F(CollectionTest, Feature_Delete_Repeated) { } TEST_F(CollectionTest, Feature_DeleteByFilter_General) { - auto func = [&](int doc_count) { + auto func = [&](bool enable_mmap, int doc_count) { auto schema = TestHelper::CreateNormalSchema(); - auto options = CollectionOptions{false, true, 100 * 1024 * 1024}; + auto options = CollectionOptions{false, enable_mmap, 100 * 1024 * 1024}; FileHelper::RemoveDirectory(col_path); // insert first @@ -1659,10 +1673,12 @@ TEST_F(CollectionTest, Feature_DeleteByFilter_General) { check_doc(doc_count); }; - func(99); - func(100); - func(101); - func(1000); + for (bool enable_mmap : {true, false}) { + func(enable_mmap, 99); + func(enable_mmap, 100); + func(enable_mmap, 101); + func(enable_mmap, 1000); + } } TEST_F(CollectionTest, Feature_DeleteByFilter_ScalarIndex) { @@ -1755,122 +1771,132 @@ TEST_F(CollectionTest, Feature_DeleteByFilter_ScalarIndex) { } 
TEST_F(CollectionTest, Feature_MixedWrite_General) { - // case1: insert -> upsert -> update -> delete - auto schema = TestHelper::CreateNormalSchema(); - auto options = CollectionOptions{false, true, 100 * 1024 * 1024}; - FileHelper::RemoveDirectory(col_path); + auto func = [&](bool enable_mmap) { + // case1: insert -> upsert -> update -> delete + auto schema = TestHelper::CreateNormalSchema(); + auto options = CollectionOptions{false, enable_mmap, 100 * 1024 * 1024}; + FileHelper::RemoveDirectory(col_path); - // insert first - auto collection = - TestHelper::CreateCollectionWithDoc(col_path, *schema, options, 0, 0); + // insert first + auto collection = + TestHelper::CreateCollectionWithDoc(col_path, *schema, options, 0, 0); - for (int i = 0; i < 100; i++) { - // std::cout << "insert: " << i << std::endl; - - // insert - auto new_doc = TestHelper::CreateDoc(i, *schema); - std::vector new_docs = {new_doc}; - auto res = collection->Insert(new_docs); - ASSERT_TRUE(res.has_value()); - ASSERT_TRUE(res.value()[0].ok()); - - // fetch - auto docs = collection->Fetch({TestHelper::MakePK(i)}); - ASSERT_TRUE(docs.has_value()); - ASSERT_EQ(docs.value().size(), 1); - ASSERT_EQ(docs.value().count(TestHelper::MakePK(i)), 1); - ASSERT_EQ(new_doc, *docs.value()[TestHelper::MakePK(i)]); + for (int i = 0; i < 100; i++) { + // std::cout << "insert: " << i << std::endl; - auto stats = collection->Stats().value(); - ASSERT_EQ(stats.doc_count, i + 1); - - // upsert - new_doc = TestHelper::CreateDoc(i + 1, *schema, TestHelper::MakePK(i)); - new_docs = {new_doc}; - res = collection->Upsert(new_docs); - ASSERT_TRUE(res.has_value()); - ASSERT_TRUE(res.value()[0].ok()); - - // fetch - docs = collection->Fetch({TestHelper::MakePK(i)}).value(); - ASSERT_TRUE(docs.has_value()); - ASSERT_EQ(docs.value().size(), 1); - ASSERT_EQ(docs.value().count(TestHelper::MakePK(i)), 1); - ASSERT_EQ(new_doc, *docs.value()[TestHelper::MakePK(i)]); + // insert + auto new_doc = TestHelper::CreateDoc(i, *schema); 
+ std::vector new_docs = {new_doc}; + auto res = collection->Insert(new_docs); + ASSERT_TRUE(res.has_value()); + ASSERT_TRUE(res.value()[0].ok()); - stats = collection->Stats().value(); - ASSERT_EQ(stats.doc_count, i + 1); - - // update - new_doc = TestHelper::CreateDoc(i + 2, *schema, TestHelper::MakePK(i)); - new_docs = {new_doc}; - res = collection->Update(new_docs); - ASSERT_TRUE(res.has_value()); - ASSERT_TRUE(res.value()[0].ok()); - - // fetch - docs = collection->Fetch({TestHelper::MakePK(i)}).value(); - ASSERT_TRUE(docs.has_value()); - ASSERT_EQ(docs.value().size(), 1); - ASSERT_EQ(docs.value().count(TestHelper::MakePK(i)), 1); - ASSERT_EQ(new_doc, *docs.value()[TestHelper::MakePK(i)]); + // fetch + auto docs = collection->Fetch({TestHelper::MakePK(i)}); + ASSERT_TRUE(docs.has_value()); + ASSERT_EQ(docs.value().size(), 1); + ASSERT_EQ(docs.value().count(TestHelper::MakePK(i)), 1); + ASSERT_EQ(new_doc, *docs.value()[TestHelper::MakePK(i)]); - stats = collection->Stats().value(); - ASSERT_EQ(stats.doc_count, i + 1); + auto stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, i + 1); - // delete - res = collection->Delete({TestHelper::MakePK(i)}); - ASSERT_TRUE(res.has_value()); - ASSERT_TRUE(res.value()[0].ok()); + // upsert + new_doc = TestHelper::CreateDoc(i + 1, *schema, TestHelper::MakePK(i)); + new_docs = {new_doc}; + res = collection->Upsert(new_docs); + ASSERT_TRUE(res.has_value()); + ASSERT_TRUE(res.value()[0].ok()); - stats = collection->Stats().value(); - ASSERT_EQ(stats.doc_count, i); - - // insert again - new_doc = TestHelper::CreateDoc(i, *schema); - new_docs = {new_doc}; - res = collection->Insert(new_docs); - ASSERT_TRUE(res.has_value()); - ASSERT_TRUE(res.value()[0].ok()); - - // fetch - docs = collection->Fetch({TestHelper::MakePK(i)}); - ASSERT_TRUE(docs.has_value()); - ASSERT_EQ(docs.value().size(), 1); - ASSERT_EQ(docs.value().count(TestHelper::MakePK(i)), 1); - ASSERT_EQ(new_doc, *docs.value()[TestHelper::MakePK(i)]); + // 
fetch + docs = collection->Fetch({TestHelper::MakePK(i)}).value(); + ASSERT_TRUE(docs.has_value()); + ASSERT_EQ(docs.value().size(), 1); + ASSERT_EQ(docs.value().count(TestHelper::MakePK(i)), 1); + ASSERT_EQ(new_doc, *docs.value()[TestHelper::MakePK(i)]); - stats = collection->Stats().value(); - ASSERT_EQ(stats.doc_count, i + 1); - } + stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, i + 1); + + // update + new_doc = TestHelper::CreateDoc(i + 2, *schema, TestHelper::MakePK(i)); + new_docs = {new_doc}; + res = collection->Update(new_docs); + ASSERT_TRUE(res.has_value()); + ASSERT_TRUE(res.value()[0].ok()); + + // fetch + docs = collection->Fetch({TestHelper::MakePK(i)}).value(); + ASSERT_TRUE(docs.has_value()); + ASSERT_EQ(docs.value().size(), 1); + ASSERT_EQ(docs.value().count(TestHelper::MakePK(i)), 1); + ASSERT_EQ(new_doc, *docs.value()[TestHelper::MakePK(i)]); + + stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, i + 1); + + // delete + res = collection->Delete({TestHelper::MakePK(i)}); + ASSERT_TRUE(res.has_value()); + ASSERT_TRUE(res.value()[0].ok()); + + stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, i); + + // insert again + new_doc = TestHelper::CreateDoc(i, *schema); + new_docs = {new_doc}; + res = collection->Insert(new_docs); + ASSERT_TRUE(res.has_value()); + ASSERT_TRUE(res.value()[0].ok()); + + // fetch + docs = collection->Fetch({TestHelper::MakePK(i)}); + ASSERT_TRUE(docs.has_value()); + ASSERT_EQ(docs.value().size(), 1); + ASSERT_EQ(docs.value().count(TestHelper::MakePK(i)), 1); + ASSERT_EQ(new_doc, *docs.value()[TestHelper::MakePK(i)]); + + stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, i + 1); + } + }; + func(true); + func(false); } TEST_F(CollectionTest, Feature_CreateIndex_General) { - // create empty collection - auto schema = TestHelper::CreateNormalSchema(); - auto options = CollectionOptions{false, true, 64 * 1024 * 1024}; - auto collection = 
TestHelper::CreateCollectionWithDoc(col_path, *schema, - options, 0, 0, false); + auto func = [&](bool enable_mmap) { + FileHelper::RemoveDirectory(col_path); + // create empty collection + auto schema = TestHelper::CreateNormalSchema(); + auto options = CollectionOptions{false, enable_mmap, 64 * 1024 * 1024}; + auto collection = TestHelper::CreateCollectionWithDoc(col_path, *schema, + options, 0, 0, false); - ASSERT_TRUE(collection->Flush().ok()); - auto stats = collection->Stats().value(); - ASSERT_EQ(stats.doc_count, 0); + ASSERT_TRUE(collection->Flush().ok()); + auto stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, 0); - auto index_params = std::make_shared(MetricType::IP); - auto s = collection->CreateIndex("dense_fp32", index_params); - if (!s.ok()) { - std::cout << "status: " << s.message() << std::endl; - ASSERT_TRUE(false); - } - auto new_index_params = std::make_shared(MetricType::COSINE); - s = collection->CreateIndex("dense_fp32", index_params); - if (!s.ok()) { - std::cout << "status: " << s.message() << std::endl; - ASSERT_TRUE(false); - } + auto index_params = std::make_shared(MetricType::IP); + auto s = collection->CreateIndex("dense_fp32", index_params); + if (!s.ok()) { + std::cout << "status: " << s.message() << std::endl; + ASSERT_TRUE(false); + } + auto new_index_params = + std::make_shared(MetricType::COSINE); + s = collection->CreateIndex("dense_fp32", index_params); + if (!s.ok()) { + std::cout << "status: " << s.message() << std::endl; + ASSERT_TRUE(false); + } - s = collection->CreateIndex("dense_fp32_invalid", index_params); - ASSERT_FALSE(s.ok()); + s = collection->CreateIndex("dense_fp32_invalid", index_params); + ASSERT_FALSE(s.ok()); + }; + func(true); + func(false); } TEST_F(CollectionTest, Feature_CreateIndex_Vector) { @@ -2229,72 +2255,77 @@ TEST_F(CollectionTest, Feature_CreateIndex_Scalar) { } TEST_F(CollectionTest, Feature_DropIndex_General) { - // create empty collection - auto schema = 
TestHelper::CreateSchemaWithVectorIndex(); - auto options = CollectionOptions{false, true, 64 * 1024 * 1204}; - auto collection = TestHelper::CreateCollectionWithDoc(col_path, *schema, - options, 0, 0, false); + auto func = [&](bool enable_mmap) { + FileHelper::RemoveDirectory(col_path); + // create empty collection + auto schema = TestHelper::CreateSchemaWithVectorIndex(); + auto options = CollectionOptions{false, enable_mmap, 64 * 1024 * 1204}; + auto collection = TestHelper::CreateCollectionWithDoc(col_path, *schema, + options, 0, 0, false); - ASSERT_TRUE(collection->Flush().ok()); - auto stats = collection->Stats().value(); - ASSERT_EQ(stats.doc_count, 0); - ASSERT_EQ(stats.index_completeness["dense_fp32"], 1); + ASSERT_TRUE(collection->Flush().ok()); + auto stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, 0); + ASSERT_EQ(stats.index_completeness["dense_fp32"], 1); - ASSERT_EQ(collection->Schema(), *schema); + ASSERT_EQ(collection->Schema(), *schema); - auto s = collection->DropIndex("dense_fp32_invalid"); - ASSERT_FALSE(s.ok()); + auto s = collection->DropIndex("dense_fp32_invalid"); + ASSERT_FALSE(s.ok()); - s = collection->DropIndex("dense_fp32"); - if (!s.ok()) { - std::cout << "drop index err: " << s.message() << std::endl; - } - ASSERT_TRUE(s.ok()); + s = collection->DropIndex("dense_fp32"); + if (!s.ok()) { + std::cout << "drop index err: " << s.message() << std::endl; + } + ASSERT_TRUE(s.ok()); - s = collection->DropIndex("dense_fp32"); - ASSERT_TRUE(s.ok()); + s = collection->DropIndex("dense_fp32"); + ASSERT_TRUE(s.ok()); - auto new_schema = std::make_shared(*schema); - s = new_schema->drop_index("dense_fp32"); - ASSERT_TRUE(s.ok()); - ASSERT_EQ(*new_schema, collection->Schema()); + auto new_schema = std::make_shared(*schema); + s = new_schema->drop_index("dense_fp32"); + ASSERT_TRUE(s.ok()); + ASSERT_EQ(*new_schema, collection->Schema()); - stats = collection->Stats().value(); - ASSERT_EQ(stats.doc_count, 0); - 
ASSERT_EQ(stats.index_completeness["dense_fp32"], 1); + stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, 0); + ASSERT_EQ(stats.index_completeness["dense_fp32"], 1); - ASSERT_EQ(*collection->Schema() - .value() - .get_vector_field("dense_fp32") - ->index_params(), - DefaultVectorIndexParams); + ASSERT_EQ(*collection->Schema() + .value() + .get_vector_field("dense_fp32") + ->index_params(), + DefaultVectorIndexParams); - s = collection->DropIndex("dense_fp32"); - if (!s.ok()) { - std::cout << "drop index err: " << s.message() << std::endl; - } - ASSERT_TRUE(s.ok()); + s = collection->DropIndex("dense_fp32"); + if (!s.ok()) { + std::cout << "drop index err: " << s.message() << std::endl; + } + ASSERT_TRUE(s.ok()); - auto schema1 = collection->Schema().value(); + auto schema1 = collection->Schema().value(); - collection.reset(); + collection.reset(); - auto result = Collection::Open(col_path, options); - ASSERT_TRUE(result.has_value()); + auto result = Collection::Open(col_path, options); + ASSERT_TRUE(result.has_value()); - collection = std::move(result.value()); - auto schema2 = collection->Schema().value(); + collection = std::move(result.value()); + auto schema2 = collection->Schema().value(); - if (schema1 != schema2) { - std::cout << "schema1: " << schema1.to_string_formatted() << std::endl; - std::cout << "schema2: " << schema2.to_string_formatted() << std::endl; - } - ASSERT_EQ(schema1, schema2); + if (schema1 != schema2) { + std::cout << "schema1: " << schema1.to_string_formatted() << std::endl; + std::cout << "schema2: " << schema2.to_string_formatted() << std::endl; + } + ASSERT_EQ(schema1, schema2); - stats = collection->Stats().value(); - ASSERT_EQ(stats.doc_count, 0); - ASSERT_EQ(stats.index_completeness["dense_fp32"], 1); + stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, 0); + ASSERT_EQ(stats.index_completeness["dense_fp32"], 1); + }; + func(true); + func(false); } TEST_F(CollectionTest, Feature_DropIndex_Vector) { @@ 
-2526,14 +2557,14 @@ TEST_F(CollectionTest, Feature_DropIndex_AfterCreate) { } TEST_F(CollectionTest, Feature_Optimize_General) { - auto func = [](int concurrency) { + auto func = [](bool enable_mmap, int concurrency) { FileHelper::RemoveDirectory(col_path); int doc_count = 1000; // create empty collection auto schema = TestHelper::CreateSchemaWithVectorIndex(); - auto options = CollectionOptions{false, true, 64 * 1024 * 1024}; + auto options = CollectionOptions{false, enable_mmap, 64 * 1024 * 1024}; auto collection = TestHelper::CreateCollectionWithDoc( col_path, *schema, options, 0, doc_count, false); @@ -2585,12 +2616,15 @@ TEST_F(CollectionTest, Feature_Optimize_General) { std::cout << "check success 3" << std::endl; }; - func(0); - func(4); + for (bool enable_mmap : {true, false}) { + func(enable_mmap, 0); + func(enable_mmap, 4); + } } TEST_F(CollectionTest, Feature_Optimize_Repeated) { - auto run_repeated_optimize_test = [&](IndexParams::Ptr index_params) { + auto run_repeated_optimize_test = [&](bool enable_mmap, + IndexParams::Ptr index_params) { ASSERT_NE(index_params, nullptr); SCOPED_TRACE(testing::Message() << "index_params=" << index_params->to_string()); @@ -2599,7 +2633,7 @@ TEST_F(CollectionTest, Feature_Optimize_Repeated) { int doc_count = 1000; auto schema = TestHelper::CreateSchemaWithVectorIndex(false, "demo", index_params); - auto options = CollectionOptions{false, true, 64 * 1024 * 1024}; + auto options = CollectionOptions{false, enable_mmap, 64 * 1024 * 1024}; auto collection = TestHelper::CreateCollectionWithDoc( col_path, *schema, options, 0, doc_count, false); @@ -2740,22 +2774,31 @@ TEST_F(CollectionTest, Feature_Optimize_Repeated) { }; - run_repeated_optimize_test(std::make_shared( - MetricType::IP, QuantizeType::UNDEFINED)); - run_repeated_optimize_test( - std::make_shared(MetricType::IP, QuantizeType::FP16)); - run_repeated_optimize_test(std::make_shared( - MetricType::IP, 16, 200, QuantizeType::UNDEFINED)); - 
run_repeated_optimize_test(std::make_shared( - MetricType::IP, 16, 200, QuantizeType::FP16)); - run_repeated_optimize_test(std::make_shared( - MetricType::IP, 10, 4, false, QuantizeType::UNDEFINED)); - run_repeated_optimize_test(std::make_shared( - MetricType::IP, 10, 4, false, QuantizeType::FP16)); + for (bool enable_mmap : {true, false}) { + run_repeated_optimize_test(enable_mmap, + std::make_shared( + MetricType::IP, QuantizeType::UNDEFINED)); + run_repeated_optimize_test( + enable_mmap, + std::make_shared(MetricType::IP, QuantizeType::FP16)); + run_repeated_optimize_test( + enable_mmap, std::make_shared( + MetricType::IP, 16, 200, QuantizeType::UNDEFINED)); + run_repeated_optimize_test( + enable_mmap, std::make_shared(MetricType::IP, 16, 200, + QuantizeType::FP16)); + run_repeated_optimize_test(enable_mmap, std::make_shared( + MetricType::IP, 10, 4, false, + QuantizeType::UNDEFINED)); + run_repeated_optimize_test( + enable_mmap, std::make_shared( + MetricType::IP, 10, 4, false, QuantizeType::FP16)); #if RABITQ_SUPPORTED - run_repeated_optimize_test(std::make_shared( - MetricType::IP, 7, 256, 16, 200, 0)); + run_repeated_optimize_test( + enable_mmap, std::make_shared(MetricType::IP, 7, + 256, 16, 200, 0)); #endif + } } TEST_F(CollectionTest, Feature_Optimize_MetricType) { @@ -3428,13 +3471,13 @@ TEST_F(CollectionTest, Feature_Query_Validate) { } TEST_F(CollectionTest, Feature_Query_General) { - auto func = [&](std::string field_name) { + auto func = [&](bool enable_mmap, std::string field_name) { FileHelper::RemoveDirectory(col_path); int doc_count = 1000; // create with normal schema auto schema = TestHelper::CreateNormalSchema(); - auto options = CollectionOptions{false, true, 100 * 1024 * 1024}; + auto options = CollectionOptions{false, enable_mmap, 100 * 1024 * 1024}; auto collection = TestHelper::CreateCollectionWithDoc( col_path, *schema, options, 0, doc_count); @@ -3496,8 +3539,10 @@ TEST_F(CollectionTest, Feature_Query_General) { } }; - 
func("dense_fp32"); - func("sparse_fp32"); + for (bool enable_mmap : {true, false}) { + func(enable_mmap, "dense_fp32"); + func(enable_mmap, "sparse_fp32"); + } } TEST_F(CollectionTest, Feature_Query_Empty) { @@ -4114,69 +4159,96 @@ TEST_F(CollectionTest, Feature_MultiQuery_CallbackReranker) { TEST_F(CollectionTest, Feature_GroupByQuery) {} TEST_F(CollectionTest, Feature_AddColumn_General) { - // create collection - int doc_count = 1000; - auto schema = TestHelper::CreateNormalSchema(); - auto options = CollectionOptions{false, true, 64 * 1024 * 1024}; - auto collection = TestHelper::CreateCollectionWithDoc( - col_path, *schema, options, 0, doc_count, false); + auto func = [&](bool enable_mmap) { + FileHelper::RemoveDirectory(col_path); + // create collection + int doc_count = 1000; + auto schema = TestHelper::CreateNormalSchema(); + auto options = CollectionOptions{false, enable_mmap, 64 * 1024 * 1024}; + auto collection = TestHelper::CreateCollectionWithDoc( + col_path, *schema, options, 0, doc_count, false); - ASSERT_TRUE(collection->Flush().ok()); - auto stats = collection->Stats().value(); - ASSERT_EQ(stats.doc_count, doc_count); - auto field_schema = - std::make_shared("add_int32", DataType::INT32, false); - auto s = collection->AddColumn(field_schema, "int32", AddColumnOptions()); - if (!s.ok()) { - std::cout << "status: " << s.message() << std::endl; - ASSERT_TRUE(false); - } - auto new_schema = collection->Schema().value(); - ASSERT_TRUE(new_schema.has_field("add_int32")); + ASSERT_TRUE(collection->Flush().ok()); + auto stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, doc_count); + auto field_schema = + std::make_shared("add_int32", DataType::INT32, false); + auto s = collection->AddColumn(field_schema, "int32", AddColumnOptions()); + if (!s.ok()) { + std::cout << "status: " << s.message() << std::endl; + ASSERT_TRUE(false); + } + auto new_schema = collection->Schema().value(); + ASSERT_TRUE(new_schema.has_field("add_int32")); - stats = 
collection->Stats().value(); - ASSERT_EQ(stats.doc_count, doc_count); + stats = collection->Stats().value(); + ASSERT_EQ(stats.doc_count, doc_count); - auto check_doc = [&](int doc_count) { - for (int i = 0; i < doc_count; i++) { - auto expect_doc = TestHelper::CreateDoc(i, new_schema); - auto result = collection->Fetch({expect_doc.pk()}); - ASSERT_TRUE(result.has_value()); - ASSERT_EQ(result.value().size(), 1); - ASSERT_EQ(result.value().count(expect_doc.pk()), 1); - auto doc = result.value()[expect_doc.pk()]; - ASSERT_NE(doc, nullptr); - if (*doc != expect_doc) { - std::cout << " doc:" << doc->to_detail_string() << std::endl; - std::cout << "expect_doc:" << expect_doc.to_detail_string() - << std::endl; + auto check_doc = [&](int doc_count) { + for (int i = 0; i < doc_count; i++) { + auto expect_doc = TestHelper::CreateDoc(i, new_schema); + auto result = collection->Fetch({expect_doc.pk()}); + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value().size(), 1); + ASSERT_EQ(result.value().count(expect_doc.pk()), 1); + auto doc = result.value()[expect_doc.pk()]; + ASSERT_NE(doc, nullptr); + if (*doc != expect_doc) { + std::cout << " doc:" << doc->to_detail_string() << std::endl; + std::cout << "expect_doc:" << expect_doc.to_detail_string() + << std::endl; + } + ASSERT_EQ(*doc, expect_doc); } - ASSERT_EQ(*doc, expect_doc); - } - }; + }; - check_doc(doc_count); + check_doc(doc_count); - // validate query result - for (int i = 1; i < 2; i++) { - SearchQuery query; - query.topk_ = 10; - query.include_vector_ = true; + // validate query result + for (int i = 1; i < 2; i++) { + SearchQuery query; + query.topk_ = 10; + query.include_vector_ = true; - auto result = collection->Query(query); - if (!result.has_value()) { - std::cout << "err: " << result.error().message() << std::endl; + auto result = collection->Query(query); + if (!result.has_value()) { + std::cout << "err: " << result.error().message() << std::endl; + } + ASSERT_TRUE(result.has_value()); + 
ASSERT_EQ(result.value().size(), std::min(query.topk_, doc_count)); + + auto fields_name = new_schema.all_field_names(); + for (int j = 0; j < std::min(query.topk_, doc_count); j++) { + auto result_doc = result.value()[j]; + auto doc_fields_names = result_doc->field_names(); + ASSERT_TRUE(vectors_equal_when_sorted(fields_name, doc_fields_names)); + } } - ASSERT_TRUE(result.has_value()); - ASSERT_EQ(result.value().size(), std::min(query.topk_, doc_count)); + check_doc(doc_count); - auto fields_name = new_schema.all_field_names(); - for (int j = 0; j < std::min(query.topk_, doc_count); j++) { - auto result_doc = result.value()[j]; - auto doc_fields_names = result_doc->field_names(); - ASSERT_TRUE(vectors_equal_when_sorted(fields_name, doc_fields_names)); + // validate query result + for (int i = 1; i < 2; i++) { + SearchQuery query; + query.topk_ = 10; + query.include_vector_ = true; + + auto result = collection->Query(query); + if (!result.has_value()) { + std::cout << "err: " << result.error().message() << std::endl; + } + ASSERT_TRUE(result.has_value()); + ASSERT_EQ(result.value().size(), std::min(query.topk_, doc_count)); + + auto fields_name = new_schema.all_field_names(); + for (int j = 0; j < std::min(query.topk_, doc_count); j++) { + auto result_doc = result.value()[j]; + auto doc_fields_names = result_doc->field_names(); + ASSERT_TRUE(vectors_equal_when_sorted(fields_name, doc_fields_names)); + } } - } + }; + func(true); + func(false); } TEST_F(CollectionTest, Feature_AddColumn_CornerCase) {