diff --git a/src/ailego/buffer/vector_page_table.cc b/src/ailego/buffer/vector_page_table.cc
index 553919fb3..c9296d640 100644
--- a/src/ailego/buffer/vector_page_table.cc
+++ b/src/ailego/buffer/vector_page_table.cc
@@ -13,15 +13,13 @@
 // limitations under the License.
 
 #include <algorithm>
+#include <chrono>
 #include <cstring>
+#include <thread>
 #include <ailego/utility/memory_helper.h>
 #include <zvec/ailego/buffer/vector_page_table.h>
 #include <zvec/core/framework/index_logger.h>
 
-#if !defined(_MSC_VER)
-#include <unistd.h>
-#endif
-
 #if defined(_MSC_VER)
 #ifndef NOMINMAX
 #define NOMINMAX
@@ -39,6 +37,29 @@ static ssize_t zvec_pread(int fd, void *buf, size_t count, size_t offset) {
   }
   return static_cast<ssize_t>(bytes_read);
 }
+// pwrite() for Windows via WriteFile+OVERLAPPED; returns bytes written or -1.
+static ssize_t zvec_pwrite(int fd, const void *buf, size_t count,
+                           size_t offset) {
+  HANDLE handle = reinterpret_cast<HANDLE>(_get_osfhandle(fd));
+  if (handle == INVALID_HANDLE_VALUE) return -1;
+  const unsigned long long pos = offset;  // widen: size_t >> 32 is UB on Win32
+  OVERLAPPED ov = {};
+  ov.Offset = static_cast<DWORD>(pos & 0xFFFFFFFFull);
+  ov.OffsetHigh = static_cast<DWORD>(pos >> 32);
+  DWORD written = 0;
+  BOOL ok = WriteFile(handle, buf, static_cast<DWORD>(count), &written, &ov);
+  return ok ? static_cast<ssize_t>(written) : -1;
+}
+#else
+#include <unistd.h>
+static inline ssize_t zvec_pread(int fd, void *buf, size_t count,
+                                 size_t offset) {  // wraps POSIX ::pread
+  return ::pread(fd, buf, count, static_cast<off_t>(offset));
+}
+static inline ssize_t zvec_pwrite(int fd, const void *buf, size_t count,
+                                  size_t offset) {  // wraps ::pwrite, no retry
+  return ::pwrite(fd, buf, count, static_cast<off_t>(offset));
+}
 #endif
 
 namespace zvec {
@@ -46,104 +67,220 @@ namespace ailego {
 
 const size_t kVectorPageSize = MemoryHelper::PageSize();
 
-void VectorPageTable::init(size_t entry_num) {
-  if (entries_) {
-    delete[] entries_;
+bool VectorPageTable::init(size_t entry_num) {  // false when over capacity
+  size_t need_segments = (entry_num + kSegmentSize - 1) / kSegmentSize;
+  if (need_segments > kMaxSegments) {
+    LOG_ERROR(
+        "VectorPageTable::init: entry_num=%zu exceeds capacity "
+        "(kMaxEntries=%zu, need_segments=%zu, kMaxSegments=%zu); "
+        "refusing to init.",
+        entry_num, kMaxEntries, need_segments, kMaxSegments);
+    return false;
+  }
+  // Drop segments left over from a previous init().  The relaxed load assumes
+  // no other thread touches the table during init() (reportedly it is only
+  // called from VecBufferPool::init) -- TODO confirm for any new call site.
+  size_t old_count = segment_count_.load(std::memory_order_relaxed);
+  for (size_t i = 0; i < old_count; ++i) {
+    delete[] segments_[i];
+    segments_[i] = nullptr;
+  }
+  for (size_t s = 0; s < need_segments; ++s) {
+    segments_[s] = new Entry[kSegmentSize];  // every slot starts evicted
+    for (size_t i = 0; i < kSegmentSize; ++i) {
+      segments_[s][i].ref_count.store(std::numeric_limits<int>::min());
+      segments_[s][i].in_evict_queue.store(false);
+      segments_[s][i].is_dirty.store(false);
+      segments_[s][i].buffer = nullptr;
+      segments_[s][i].file_offset = 0;
+    }
+  }
+  // Publish the freshly initialized segments.  Both counters are
+  // release-stored after the Entry setup above (segment_count_ first,
+  // entry_num_ last), so a reader that acquire-loads either counter is
+  // guaranteed to also observe the initialized entries it covers.
+  segment_count_.store(need_segments, std::memory_order_release);
+  entry_num_.store(entry_num, std::memory_order_release);
+  return true;
+}
+
+bool VectorPageTable::extend(size_t new_entry_num) {  // grow-only resize
+  // Shrinking is a no-op.  The relaxed loads below assume extend() calls are
+  // serialized by the caller (the original note cites the BufferStorage write
+  // latch around extend_file) -- TODO confirm; no locking happens here.
+  if (new_entry_num <= entry_num_.load(std::memory_order_relaxed)) {
+    return true;
+  }
+  size_t new_segment_count = (new_entry_num + kSegmentSize - 1) / kSegmentSize;
+  if (new_segment_count > kMaxSegments) {
+    LOG_ERROR(
+        "VectorPageTable::extend: new_entry_num=%zu exceeds capacity "
+        "(kMaxEntries=%zu, new_segment_count=%zu, kMaxSegments=%zu); "
+        "refusing to extend.",
+        new_entry_num, kMaxEntries, new_segment_count, kMaxSegments);
+    return false;
   }
-  entry_num_ = entry_num;
-  entries_ = new Entry[entry_num_];
-  for (size_t i = 0; i < entry_num_; i++) {
-    entries_[i].ref_count.store(std::numeric_limits<int>::min());
-    entries_[i].in_evict_queue.store(false);
-    entries_[i].buffer = nullptr;
+  size_t old_count = segment_count_.load(std::memory_order_relaxed);
+  for (size_t s = old_count; s < new_segment_count; ++s) {
+    segments_[s] = new Entry[kSegmentSize];  // existing segments are untouched
+    for (size_t i = 0; i < kSegmentSize; ++i) {
+      segments_[s][i].ref_count.store(std::numeric_limits<int>::min());
+      segments_[s][i].in_evict_queue.store(false);
+      segments_[s][i].is_dirty.store(false);
+      segments_[s][i].buffer = nullptr;
+      segments_[s][i].file_offset = 0;
+    }
   }
+  // Publish in the same order as init(): segment_count_ first, entry_num_
+  // last.  Both are release-stores, so the segment allocation and Entry
+  // initialization above are visible to any reader that acquire-loads either
+  // counter.
+  segment_count_.store(new_segment_count, std::memory_order_release);
+  entry_num_.store(new_entry_num, std::memory_order_release);
+  return true;
 }
 
 char *VectorPageTable::acquire_block(block_id_t block_id) {
-  assert(block_id < entry_num_);
-  Entry &entry = entries_[block_id];
+  assert(block_id < entry_num_.load(std::memory_order_relaxed));
+  Entry &e = entry_at(block_id);  // pins the block, or yields nullptr
   while (true) {
-    int current_count = entry.ref_count.load(std::memory_order_acquire);
+    int current_count = e.ref_count.load(std::memory_order_acquire);  // < 0: evicting or evicted
     if (current_count < 0) {
       return nullptr;
     }
-    if (entry.ref_count.compare_exchange_weak(current_count, current_count + 1,
-                                              std::memory_order_acq_rel,
-                                              std::memory_order_acquire)) {
-      return entry.buffer;
+    if (e.ref_count.compare_exchange_weak(current_count, current_count + 1,
+                                          std::memory_order_acq_rel,
+                                          std::memory_order_acquire)) {
+      return e.buffer;  // pinned: the CAS added one reference
     }
   }
 }
 
 void VectorPageTable::release_block(block_id_t block_id) {
-  assert(block_id < entry_num_);
-  Entry &entry = entries_[block_id];
+  assert(block_id < entry_num_.load(std::memory_order_relaxed));
+  Entry &e = entry_at(block_id);  // drops one pin taken by an acquire
 
-  if (entry.ref_count.fetch_sub(1, std::memory_order_release) == 1) {
+  if (e.ref_count.fetch_sub(1, std::memory_order_release) == 1) {  // last pin gone
     std::atomic_thread_fence(std::memory_order_acquire);
-    // Attempt to transition in_evict_queue from false -> true.  The CAS ensures
-    // only one thread enqueues this block even if multiple threads race here.
     bool expected = false;
-    if (entry.in_evict_queue.compare_exchange_strong(
-            expected, true, std::memory_order_acq_rel,
-            std::memory_order_relaxed)) {
+    if (e.in_evict_queue.compare_exchange_strong(expected, true,  // enqueue once
+                                                 std::memory_order_acq_rel,
+                                                 std::memory_order_relaxed)) {
       BlockEvictionQueue::BlockType block;
       block.page_table = this;
       block.vector_block.first = block_id;
       block.vector_block.second = 0;
       BlockEvictionQueue::get_instance().add_single_block(block, 0);
     }
-    // else: block is already in the eviction queue; do not add a duplicate
-    // entry.
   }
 }
 
 void VectorPageTable::evict_block(block_id_t block_id) {
-  assert(block_id < entry_num_);
-  Entry &entry = entries_[block_id];
-  char *buffer = entry.buffer;
+  assert(block_id < entry_num_.load(std::memory_order_relaxed));
+  Entry &e = entry_at(block_id);
   int expected = 0;
-  if (entry.ref_count.compare_exchange_strong(
-          expected, std::numeric_limits<int>::min())) {
+  // Two-phase eviction: CAS 0 -> kEvicting (-1) makes set_block_acquired wait
+  // until the closing release-store: INT_MIN (evicted) or 0 (flush failed).
+  static constexpr int kEvicting = -1;
+  if (e.ref_count.compare_exchange_strong(expected, kEvicting)) {
+    char *buffer = e.buffer;
+    if (buffer && e.is_dirty.load(std::memory_order_relaxed) &&
+        flush_callback_ &&
+        flush_callback_(block_id, buffer, kVectorPageSize, e.file_offset) != 0) {
+      // Flush failed: keep the dirty page resident instead of losing the data.
+      e.in_evict_queue.store(false, std::memory_order_relaxed);
+      e.ref_count.store(0, std::memory_order_release);
+      return;
+    }
     if (buffer) {
+      e.buffer = nullptr;
       MemoryLimitPool::get_instance().release_buffer(buffer, kVectorPageSize);
     }
+    e.is_dirty.store(false, std::memory_order_relaxed);
+    // Fully evicted; release pairs with set_block_acquired's acquire-load.
+    e.ref_count.store(std::numeric_limits<int>::min(),
+                      std::memory_order_release);
   }
-  // Always reset in_evict_queue regardless of whether the CAS succeeded:
-  //  - On success: the block is evicted; future releases should re-register it.
-  //  - On failure: the block was re-acquired by another thread between the
-  //    ref-count check and this call.  Clearing in_evict_queue lets the next
-  //    release_block() re-enqueue it so it is not silently lost.
-  entry.in_evict_queue.store(false, std::memory_order_relaxed);
+  e.in_evict_queue.store(false, std::memory_order_relaxed);
 }
 
-char *VectorPageTable::set_block_acquired(block_id_t block_id, char *buffer) {
-  assert(block_id < entry_num_);
-  Entry &entry = entries_[block_id];
+char *VectorPageTable::set_block_acquired(block_id_t block_id, char *buffer,
+                                          size_t file_offset) {
+  assert(block_id < entry_num_.load(std::memory_order_acquire));
+  Entry &e = entry_at(block_id);
+  // While evict_block() holds the entry in kEvicting (-1) this call waits,
+  // without a timeout, for the evicting thread to publish the next state.
+  // The wait backs off in tiers (spin, yield, 100us sleep, 1ms sleep) and
+  // logs in tiers so that a stuck eviction is observable.  On return the
+  // block is pinned: ref_count was either incremented or set to 1.
+  using clock = std::chrono::steady_clock;
+  const auto wait_start = clock::now();
+  auto last_log = wait_start;
+  unsigned spin_count = 0;
+  bool warned = false;
   while (true) {
-    int current_count = entry.ref_count.load(std::memory_order_relaxed);
+    int current_count = e.ref_count.load(std::memory_order_acquire);
     if (current_count >= 0) {
-      if (entry.ref_count.compare_exchange_weak(
-              current_count, current_count + 1, std::memory_order_acq_rel,
-              std::memory_order_acquire)) {
+      if (e.ref_count.compare_exchange_weak(current_count, current_count + 1,
+                                            std::memory_order_acq_rel,
+                                            std::memory_order_acquire)) {
         MemoryLimitPool::get_instance().release_buffer(buffer, kVectorPageSize);
-        return entry.buffer;
+        return e.buffer;  // already resident: the caller's buffer was returned to the pool
       }
+    } else if (current_count == std::numeric_limits<int>::min()) {
+      // Fully evicted (INT_MIN): install the caller's buffer in this slot.
+      e.buffer = buffer;  // NOTE(review): plain stores, no CAS; assumes one installer per block
+      e.file_offset = file_offset;
+      e.in_evict_queue.store(false, std::memory_order_relaxed);
+      e.is_dirty.store(false, std::memory_order_relaxed);
+      e.ref_count.store(1, std::memory_order_release);
+      return e.buffer;
     } else {
-      entry.buffer = buffer;
-      entry.in_evict_queue.store(false, std::memory_order_relaxed);
-      entry.ref_count.store(1, std::memory_order_release);
-      return entry.buffer;
+      // Any other negative value, i.e. kEvicting (-1): eviction in progress.
+      // Tiered backoff: hot spin first, then yield, then short/longer sleeps.
+      ++spin_count;
+      if (spin_count < 64) {
+        // Pure busy wait for the expected microsecond-scale case.
+      } else if (spin_count < 1024) {
+        std::this_thread::yield();
+      } else if (spin_count < 8192) {
+        std::this_thread::sleep_for(std::chrono::microseconds(100));
+      } else {
+        std::this_thread::sleep_for(std::chrono::milliseconds(1));
+      }
+      // Tiered diagnostics: warn once after 100ms, then error once per second.
+      const auto now = clock::now();
+      const auto elapsed = now - wait_start;
+      if (!warned && elapsed >= std::chrono::milliseconds(100)) {
+        LOG_WARN(
+            "set_block_acquired: long kEvicting wait on block_id=%zu "
+            "(>=100ms); evict_block may be slow",
+            static_cast<size_t>(block_id));
+        warned = true;
+      }
+      if (elapsed >= std::chrono::seconds(1) &&
+          (now - last_log) >= std::chrono::seconds(1)) {
+        const auto secs =
+            std::chrono::duration_cast<std::chrono::seconds>(elapsed).count();
+        LOG_ERROR(
+            "set_block_acquired: stuck in kEvicting on block_id=%zu for "
+            "%lld s; evict_block owner may be hung or starved",
+            static_cast<size_t>(block_id), static_cast<long long>(secs));
+        last_log = now;
+      }
     }
   }
 }
 
-VecBufferPool::VecBufferPool(const std::string &filename) {
+VecBufferPool::VecBufferPool(const std::string &filename, bool writable) {
   file_name_ = filename;
+  writable_ = writable;
 #if defined(_MSC_VER)
-  fd_ = _open(filename.c_str(), O_RDONLY | _O_BINARY);
+  int flags = writable_ ? (O_RDWR | _O_BINARY) : (O_RDONLY | _O_BINARY);
+  fd_ = _open(filename.c_str(), flags, 0644);
 #else
-  fd_ = open(filename.c_str(), O_RDONLY);
+  int flags = writable_ ? O_RDWR : O_RDONLY;
+  fd_ = ::open(filename.c_str(), flags, 0644);
 #endif
   if (fd_ < 0) {
     throw std::runtime_error("Failed to open file: " + filename);
@@ -164,11 +301,40 @@ VecBufferPool::VecBufferPool(const std::string &filename) {
 
 int VecBufferPool::init() {
   size_t block_num = (file_size_ + kVectorPageSize - 1) / kVectorPageSize;
-  page_table_.init(block_num);
+  if (!page_table_.init(block_num)) {  // one page-table entry per file page
+    LOG_ERROR(
+        "VecBufferPool::init: page_table_ init failed for file[%s], "
+        "file_size=%zu, block_num=%zu (exceeds "
+        "VectorPageTable::kMaxEntries=%zu)",
+        file_name_.c_str(), file_size_, block_num,
+        VectorPageTable::kMaxEntries);
+    return -1;
+  }
   block_mutexes_ =
       std::make_unique<std::mutex[]>(VecBufferPool::kMutexBucketCount);
   LOG_DEBUG("entry num: %zu, file_size: %zu", page_table_.entry_num(),
             file_size_);
+
+  // Writable pools hand the page table a flush callback that pwrite()s a
+  // block back to fd_.  The lambda copies fd but captures file_name_ by
+  // reference, so the callback must not outlive this VecBufferPool.
+  if (writable_) {
+    int fd = fd_;
+    const std::string &name = file_name_;
+    page_table_.set_flush_callback([fd, &name](block_id_t /*block_id*/,
+                                               char *buf, size_t sz,
+                                               size_t off) -> int {
+      ssize_t w = zvec_pwrite(fd, buf, sz, off);
+      if (w != static_cast<ssize_t>(sz)) {  // short write counts as failure
+        LOG_ERROR(
+            "Buffer pool flush failed: file[%s], offset[%zu], "
+            "expected[%zu], got[%zd]",
+            name.c_str(), off, sz, w);
+        return -1;
+      }
+      return 0;
+    });
+  }
   return 0;
 }
 
@@ -213,11 +379,7 @@ char *VecBufferPool::acquire_buffer(block_id_t page_id, int retry) {
   if (expected_bytes < kVectorPageSize) {
     std::memset(buffer + expected_bytes, 0, kVectorPageSize - expected_bytes);
   }
-#if defined(_MSC_VER)
   ssize_t read_bytes = zvec_pread(fd_, buffer, expected_bytes, page_offset);
-#else
-  ssize_t read_bytes = pread(fd_, buffer, expected_bytes, page_offset);
-#endif
   if (read_bytes != static_cast<ssize_t>(expected_bytes)) {
     LOG_ERROR(
         "Buffer pool failed to read file at offset: file[%s], page_id[%zu], "
@@ -226,15 +388,11 @@ char *VecBufferPool::acquire_buffer(block_id_t page_id, int retry) {
     MemoryLimitPool::get_instance().release_buffer(buffer, kVectorPageSize);
     return nullptr;
   }
-  return page_table_.set_block_acquired(page_id, buffer);
+  return page_table_.set_block_acquired(page_id, buffer, page_offset);
 }
 
 int VecBufferPool::get_meta(size_t offset, size_t length, char *buffer) {
-#if defined(_MSC_VER)
   ssize_t read_bytes = zvec_pread(fd_, buffer, length, offset);
-#else
-  ssize_t read_bytes = pread(fd_, buffer, length, offset);
-#endif
   if (read_bytes != static_cast<ssize_t>(length)) {
     LOG_ERROR(
         "Buffer pool failed to read file at offset: file[%s], offset[%zu], "
@@ -245,6 +403,141 @@ int VecBufferPool::get_meta(size_t offset, size_t length, char *buffer) {
   return 0;
 }
 
+int VecBufferPool::write_range(size_t file_offset, size_t length,
+                               const char *src) {
+  // Copies `length` bytes from `src` into the cached pages that cover
+  // [file_offset, file_offset + length) and marks each touched page dirty.
+  // Returns 0 on success, -1 on a read-only pool or when a page cannot be
+  // acquired (pages already written before that point stay dirty).
+  if (!writable_) {
+    LOG_ERROR("write_range called on read-only pool: file[%s]",
+              file_name_.c_str());
+    return -1;
+  }
+  if (length == 0) return 0;
+  const size_t end_offset = file_offset + length;  // one past the last byte
+  const size_t last_page = (end_offset - 1) / kVectorPageSize;
+  const char *cursor = src;
+  size_t pos = file_offset;  // file offset of the next byte to write
+  for (size_t pg = file_offset / kVectorPageSize; pg <= last_page; ++pg) {
+    // acquire_buffer() returns the whole page, so bytes outside the written
+    // span keep whatever the pool loaded for them.
+    char *page = this->acquire_buffer(pg, 50);
+    if (page == nullptr) {
+      LOG_ERROR("write_range acquire failed: file[%s], page[%zu]",
+                file_name_.c_str(), pg);
+      return -1;
+    }
+    const size_t in_page = pos - pg * kVectorPageSize;
+    const size_t chunk = std::min(kVectorPageSize - in_page, end_offset - pos);
+    std::memcpy(page + in_page, cursor, chunk);
+    page_table_.mark_dirty(pg);
+    page_table_.release_block(pg);
+    cursor += chunk;
+    pos += chunk;
+  }
+  return 0;
+}
+
+int VecBufferPool::write_meta(size_t offset, size_t length,
+                              const char *buffer) {
+  // Writes `length` bytes straight to the file at `offset` via zvec_pwrite;
+  // returns 0 on a complete write, -1 on a read-only pool or a short write.
+  if (writable_) {
+    const ssize_t written = zvec_pwrite(fd_, buffer, length, offset);
+    if (written == static_cast<ssize_t>(length)) return 0;
+    LOG_ERROR(
+        "Buffer pool failed to write meta: file[%s], offset[%zu], "
+        "length[%zu], got[%zd]",
+        file_name_.c_str(), offset, length, written);
+    return -1;
+  }
+  LOG_ERROR("write_meta called on read-only pool: file[%s]",
+            file_name_.c_str());
+  return -1;
+}
+
+int VecBufferPool::flush_all() {
+  // Asks the page table to flush every block that reports dirty.  Returns 0
+  // for a read-only pool or when every flush succeeds; otherwise the
+  // non-zero code of the last failing flush_block() call.
+  if (!writable_) return 0;
+  int last_error = 0;
+  size_t dirty_pages = 0;
+  size_t failed_pages = 0;
+  for (size_t blk = 0; blk < page_table_.entry_num(); ++blk) {
+    if (!page_table_.is_block_dirty(blk)) continue;
+    ++dirty_pages;
+    const int status = page_table_.flush_block(blk);
+    if (status == 0) {
+      continue;
+    }
+    last_error = status;
+    ++failed_pages;
+  }
+  if (failed_pages > 0) {
+    // One summary line, so that a caller ignoring the return value still
+    // leaves a trace that the on-disk image is missing these pages.
+    LOG_ERROR(
+        "VecBufferPool::flush_all: %zu/%zu dirty page(s) failed to flush, "
+        "file[%s] last_rc=%d -- on-disk data may be stale.",
+        failed_pages, dirty_pages, file_name_.c_str(), last_error);
+  }
+  return last_error;
+}
+
+bool VecBufferPool::extend_file(size_t new_size) {  // grow-only; true on success
+  if (!writable_) {
+    LOG_ERROR("extend_file called on read-only pool: file[%s]",
+              file_name_.c_str());
+    return false;
+  }
+  if (new_size <= file_size_) {  // never shrinks: smaller sizes are a no-op
+    return true;
+  }
+  // Validate against the page table's static capacity BEFORE touching the
+  // file.  Otherwise a successful ftruncate followed by a failed
+  // page_table_.extend() would leave the file larger than the range the
+  // page table has Entry slots for.
+  size_t new_entry_num = (new_size + kVectorPageSize - 1) / kVectorPageSize;
+  if (new_entry_num > VectorPageTable::kMaxEntries) {
+    LOG_ERROR(
+        "extend_file: requested new_size=%zu would require %zu page entries, "
+        "exceeding VectorPageTable::kMaxEntries=%zu (file=%s).",
+        new_size, new_entry_num, VectorPageTable::kMaxEntries,
+        file_name_.c_str());
+    return false;
+  }
+#if defined(_MSC_VER)
+  if (_chsize_s(fd_, static_cast<int64_t>(new_size)) != 0) {
+    LOG_ERROR("extend_file _chsize_s failed: file[%s], new_size[%zu]",
+              file_name_.c_str(), new_size);
+    return false;
+  }
+#else
+  if (::ftruncate(fd_, static_cast<off_t>(new_size)) != 0) {
+    LOG_ERROR("extend_file ftruncate failed: file[%s], new_size[%zu]",
+              file_name_.c_str(), new_size);
+    return false;
+  }
+#endif
+  file_size_ = new_size;
+  // Grow the page table to cover the new range; extend() only appends
+  // segments, so existing entries keep their addresses.  Capacity was
+  // pre-checked above, so a failure here is unexpected -- note that the file
+  // and file_size_ have already grown by then and are not rolled back.
+  if (new_entry_num > page_table_.entry_num()) {
+    if (!page_table_.extend(new_entry_num)) {
+      LOG_ERROR(
+          "extend_file: page_table_.extend(%zu) failed unexpectedly after "
+          "capacity pre-check (file=%s, new_size=%zu).",
+          new_entry_num, file_name_.c_str(), new_size);
+      return false;
+    }
+  }
+  return true;
+}
+
 char *VecBufferPoolHandle::get_single_page(size_t file_offset, size_t len,
                                            size_t &out_page_id) {
   size_t first_page = file_offset / kVectorPageSize;
@@ -252,6 +545,10 @@ char *VecBufferPoolHandle::get_single_page(size_t file_offset, size_t len,
   out_page_id = first_page;
   char *page = pool_.acquire_buffer(first_page, 50);
   if (!page) {
+    LOG_ERROR(
+        "VecBufferPoolHandle::get_single_page: acquire_buffer failed, "
+        "file_offset=%zu, len=%zu, page=%zu, page_size=%zu",
+        file_offset, len, first_page, kVectorPageSize);
     return nullptr;
   }
   return page + (file_offset - first_page * kVectorPageSize);
@@ -269,6 +566,11 @@ bool VecBufferPoolHandle::read_range(size_t file_offset, size_t len,
   for (size_t pg = first_page; pg <= last_page; ++pg) {
     char *page = pool_.acquire_buffer(pg, 50);
     if (!page) {
+      LOG_ERROR(
+          "VecBufferPoolHandle::read_range: acquire_buffer failed, "
+          "file_offset=%zu, len=%zu, page=%zu, first_page=%zu, last_page=%zu, "
+          "page_size=%zu",
+          file_offset, len, pg, first_page, last_page, kVectorPageSize);
       return false;
     }
     size_t page_start = pg * kVectorPageSize;
@@ -286,6 +588,24 @@ int VecBufferPoolHandle::get_meta(size_t offset, size_t length, char *buffer) {
   return pool_.get_meta(offset, length, buffer);
 }
 
+int VecBufferPoolHandle::write_range(size_t file_offset, size_t len,
+                                     const char *src) {  // forwards to pool_
+  return pool_.write_range(file_offset, len, src);
+}
+
+int VecBufferPoolHandle::write_meta(size_t offset, size_t length,
+                                    const char *buffer) {  // forwards to pool_
+  return pool_.write_meta(offset, length, buffer);
+}
+
+int VecBufferPoolHandle::flush_all() {  // forwards to pool_; returns its code
+  return pool_.flush_all();
+}
+
+bool VecBufferPoolHandle::writable() const {  // reports pool_.writable()
+  return pool_.writable();
+}
+
 void VecBufferPoolHandle::release_one(block_id_t block_id) {
   pool_.page_table_.release_block(block_id);
 }
diff --git a/src/core/algorithm/flat/flat_streamer.cc b/src/core/algorithm/flat/flat_streamer.cc
index 8969efc14..5e6171659 100644
--- a/src/core/algorithm/flat/flat_streamer.cc
+++ b/src/core/algorithm/flat/flat_streamer.cc
@@ -34,7 +34,7 @@ FlatStreamer<BATCH_SIZE>::FlatStreamer() : entity_(stats_) {}
 
 template <size_t BATCH_SIZE>
 FlatStreamer<BATCH_SIZE>::~FlatStreamer() {
-  if (state_ == STATE_INITED) {
+  if (state_ == STATE_INITED || state_ == STATE_OPENED) {  // also when opened
     this->cleanup();
   }
 }
diff --git a/src/core/algorithm/flat/flat_streamer_entity.cc b/src/core/algorithm/flat/flat_streamer_entity.cc
index 988f5fdfb..87d9a1906 100644
--- a/src/core/algorithm/flat/flat_streamer_entity.cc
+++ b/src/core/algorithm/flat/flat_streamer_entity.cc
@@ -165,13 +165,20 @@ int FlatStreamerEntity::add(uint64_t key, const void *vec, size_t size) {
 
   IndexStorage::MemoryBlock head_block;
   this->get_head_block(head_block);
-  const BlockLocation *bl =
-      reinterpret_cast<const BlockLocation *>(head_block.data());
-  if (ailego_unlikely(bl == nullptr)) {
-    LOG_ERROR("Failed to get block loc");
-    return IndexError_ReadData;
+  BlockLocation block;
+  {
+    const BlockLocation *bl =
+        reinterpret_cast<const BlockLocation *>(head_block.data());
+    if (ailego_unlikely(bl == nullptr)) {
+      LOG_ERROR("Failed to get block loc");
+      return IndexError_ReadData;
+    }
+    block = *bl;
   }
-  BlockLocation block = *bl;
+  // Release the head block reference early so that the buffer pool ref_count
+  // and memory budget held by it do not block subsequent acquire/evict in this
+  // function (alloc_block / add_to_block may compete for the same memory).
+  head_block.reset(nullptr);
 
   if (!this->is_valid_block(block)) {
     int ret = this->alloc_block(block, &block);
@@ -922,6 +929,9 @@ int FlatStreamerEntity::add_vector_with_id(const uint32_t id, const void *query,
     this->get_head_block(head_block);
     BlockLocation block =
         *reinterpret_cast<const BlockLocation *>(head_block.data());
+    // Release buffer-pool pin before any alloc_block() call that may trigger
+    // append_segment() and rebuild the pool (same reason as in add()).
+    head_block.reset(nullptr);
     if (!this->is_valid_block(block)) {
       int ret = this->alloc_block(block, &block);
       if (ailego_unlikely(ret != 0)) {
diff --git a/src/core/algorithm/hnsw/hnsw_index_hash.h b/src/core/algorithm/hnsw/hnsw_index_hash.h
index 1557dcd93..cc59e84ab 100644
--- a/src/core/algorithm/hnsw/hnsw_index_hash.h
+++ b/src/core/algorithm/hnsw/hnsw_index_hash.h
@@ -41,9 +41,9 @@ class HnswIndexHashMap {
           items_(reinterpret_cast<const Item *>(data)) {}
     //! Return a empty loc or the key item loc
 
-    Slot(Chunk::Pointer &&chunk, IndexStorage::MemoryBlock &&mem_block)
-        : chunk_(std::move(chunk)), items_block_(std::move(mem_block)) {
-      items_ = reinterpret_cast<const Item *>(items_block_.data());
+    Slot(Chunk::Pointer &&chunk, std::vector<char> &&local_data)
+        : chunk_(std::move(chunk)), local_data_(std::move(local_data)) {
+      items_ = reinterpret_cast<const Item *>(local_data_.data());  // view into the owned byte buffer
     }
     const_iterator find(key_type key, uint32_t max_items, uint32_t mask) const {
       auto it = &items_[key & mask];
@@ -73,8 +73,8 @@ class HnswIndexHashMap {
 
    private:
     Chunk::Pointer chunk_{};
-    const Item *items_{nullptr};  // point to chunk data
-    IndexStorage::MemoryBlock items_block_{};
+    const Item *items_{nullptr};  // point to local_data_
+    std::vector<char> local_data_{};
   };
 
  public:
@@ -114,9 +114,9 @@ class HnswIndexHashMap {
   }
 
   int cleanup(void) {
-    broker_.reset();
     slots_.clear();
     slots_.shrink_to_fit();
+    broker_.reset();
     mask_bits_ = 0U;
     slot_items_ = 0U;
     slot_loc_mask_ = 0U;
@@ -141,7 +141,6 @@ class HnswIndexHashMap {
     auto idx = key >> mask_bits_;
     if (idx >= slots_.size()) {
       if (ailego_unlikely(idx >= slots_.capacity())) {
-        LOG_ERROR("no space to insert");
         return false;
       }
       for (auto i = slots_.size(); i <= idx; ++i) {
@@ -152,7 +151,6 @@ class HnswIndexHashMap {
     }
     auto it = slots_[idx].find(key, slot_items_, slot_loc_mask_);
     if (ailego_unlikely(it == nullptr)) {
-      LOG_ERROR("no space to insert");
       return false;
     }
 
@@ -179,14 +177,10 @@ class HnswIndexHashMap {
       LOG_ERROR("Chunk resize failed, size=%zu", size);
       return false;
     }
-    //! Read the whole data to memory
-    IndexStorage::MemoryBlock data_block;
-    if (ailego_unlikely(chunk->read(0U, data_block, size) != size)) {
-      LOG_ERROR("Chunk read failed, size=%zu", size);
-      return false;
-    }
-
-    slots_.emplace_back(std::move(chunk), std::move(data_block));
+    //! Use a local zero-initialized buffer; new chunks contain all zeros,
+    //! so no buffer-pool read is needed and no ref_count is pinned.
+    std::vector<char> local_buf(size, 0);
+    slots_.emplace_back(std::move(chunk), std::move(local_buf));
     return true;
   }
 
@@ -208,13 +202,14 @@ class HnswIndexHashMap {
             i, chunk->data_size(), size);
         return IndexError_InvalidFormat;
       }
-      //! Read the whole data to memory
-      IndexStorage::MemoryBlock data_block;
-      if (ailego_unlikely(chunk->read(0U, data_block, size) != size)) {
-        LOG_ERROR("Chunk read failed, size=%zu", size);
-        return false;
+      //! Copy chunk data into a local buffer via fetch() so that no
+      //! buffer-pool block is pinned for the lifetime of the Slot.
+      std::vector<char> local_buf(size);
+      if (ailego_unlikely(chunk->fetch(0U, local_buf.data(), size) != size)) {
+        LOG_ERROR("Chunk fetch failed, size=%zu", size);
+        return IndexError_InvalidFormat;
       }
-      slots_.emplace_back(std::move(chunk), std::move(data_block));
+      slots_.emplace_back(std::move(chunk), std::move(local_buf));
     }
     return 0;
   }
diff --git a/src/core/algorithm/hnsw/hnsw_streamer.cc b/src/core/algorithm/hnsw/hnsw_streamer.cc
index 935cae5d4..c5e78f415 100644
--- a/src/core/algorithm/hnsw/hnsw_streamer.cc
+++ b/src/core/algorithm/hnsw/hnsw_streamer.cc
@@ -28,7 +28,7 @@ namespace core {
 HnswStreamer::HnswStreamer() = default;
 
 HnswStreamer::~HnswStreamer() {
-  if (state_ == STATE_INITED) {
+  if (state_ == STATE_INITED || state_ == STATE_OPENED) {  // also when opened
     this->cleanup();
   }
 }
diff --git a/src/core/algorithm/hnsw/hnsw_streamer_entity.cc b/src/core/algorithm/hnsw/hnsw_streamer_entity.cc
index acc9bee36..a8ada19e6 100644
--- a/src/core/algorithm/hnsw/hnsw_streamer_entity.cc
+++ b/src/core/algorithm/hnsw/hnsw_streamer_entity.cc
@@ -37,6 +37,7 @@ int HnswStreamerEntity::init(size_t max_doc_cnt) {
   std::lock_guard<std::mutex> lock(mutex_);
   broker_ = std::make_shared<ChunkBroker>(stats_);
   upper_neighbor_index_ = std::make_shared<NIHashMap>();
+  upper_neighbor_rw_mutex_ = std::make_shared<std::shared_mutex>();
   keys_map_lock_ = std::make_shared<ailego::SharedMutex>();
   keys_map_ = std::make_shared<HashMap<key_t, node_id_t>>();
   if (!keys_map_ || !upper_neighbor_index_ || !broker_ || !keys_map_lock_) {
@@ -767,9 +768,10 @@ const HnswEntity::Pointer HnswStreamerEntity::clone() const {
   HnswStreamerEntity *entity = new (std::nothrow) HnswStreamerEntity(
       stats_, header(), chunk_size_, node_index_mask_bits_,
       upper_neighbor_mask_bits_, filter_same_key_, get_vector_enabled_,
-      upper_neighbor_index_, keys_map_lock_, keys_map_, use_key_info_map_,
-      std::move(node_chunks), std::move(upper_neighbor_chunks), broker_,
-      node_chunk_bases_, upper_neighbor_chunk_bases_);
+      upper_neighbor_index_, upper_neighbor_rw_mutex_, keys_map_lock_,
+      keys_map_, use_key_info_map_, std::move(node_chunks),
+      std::move(upper_neighbor_chunks), broker_, node_chunk_bases_,
+      upper_neighbor_chunk_bases_);
   if (ailego_unlikely(!entity)) {
     LOG_ERROR("HnswStreamerEntity new failed");
   }
@@ -800,9 +802,9 @@ const HnswEntity::Pointer HnswMmapStreamerEntity::clone() const {
   auto *entity = new (std::nothrow) HnswMmapStreamerEntity(
       stats_, header(), chunk_size_, node_index_mask_bits_,
       upper_neighbor_mask_bits_, filter_same_key_, get_vector_enabled_,
-      upper_neighbor_index_, keys_map_lock_, keys_map_, use_key_info_map_,
-      std::move(node_chunks), std::move(upper_neighbor_chunks), broker_,
-      nullptr, nullptr);
+      upper_neighbor_index_, upper_neighbor_rw_mutex_, keys_map_lock_,
+      keys_map_, use_key_info_map_, std::move(node_chunks),
+      std::move(upper_neighbor_chunks), broker_, nullptr, nullptr);
   if (ailego_unlikely(!entity)) {
     LOG_ERROR("HnswMmapStreamerEntity new failed");
   }
@@ -833,9 +835,9 @@ const HnswEntity::Pointer HnswContiguousStreamerEntity::clone() const {
   auto *entity = new (std::nothrow) HnswContiguousStreamerEntity(
       stats_, header(), chunk_size_, node_index_mask_bits_,
       upper_neighbor_mask_bits_, filter_same_key_, get_vector_enabled_,
-      upper_neighbor_index_, keys_map_lock_, keys_map_, use_key_info_map_,
-      std::move(node_chunks), std::move(upper_neighbor_chunks), broker_,
-      nullptr, nullptr);
+      upper_neighbor_index_, upper_neighbor_rw_mutex_, keys_map_lock_,
+      keys_map_, use_key_info_map_, std::move(node_chunks),
+      std::move(upper_neighbor_chunks), broker_, nullptr, nullptr);
   if (ailego_unlikely(!entity)) {
     LOG_ERROR("HnswContiguousStreamerEntity new failed");
     return HnswEntity::Pointer();
diff --git a/src/core/algorithm/hnsw/hnsw_streamer_entity.h b/src/core/algorithm/hnsw/hnsw_streamer_entity.h
index 3c2fb0cea..677393de3 100644
--- a/src/core/algorithm/hnsw/hnsw_streamer_entity.h
+++ b/src/core/algorithm/hnsw/hnsw_streamer_entity.h
@@ -17,6 +17,7 @@
 #include <iostream>
 #include <memory>
 #include <mutex>
+#include <shared_mutex>
 #if defined(__linux__) || defined(__APPLE__)
 #include <sys/mman.h>
 #endif
@@ -246,19 +247,19 @@ class HnswStreamerEntity : public HnswEntity {
   using NIHashMapPointer = std::shared_ptr<NIHashMap>;
 
   //! Clone construct, used by clone method in subclasses
-  HnswStreamerEntity(IndexStreamer::Stats &stats, const HNSWHeader &hd,
-                     size_t chunk_size, uint32_t node_index_mask_bits,
-                     uint32_t upper_neighbor_mask_bits, bool filter_same_key,
-                     bool get_vector_enabled,
-                     const NIHashMapPointer &upper_neighbor_index,
-                     std::shared_ptr<ailego::SharedMutex> &keys_map_lock,
-                     const HashMapPointer<key_t, node_id_t> &keys_map,
-                     bool use_key_info_map,
-                     std::vector<Chunk::Pointer> &&node_chunks,
-                     std::vector<Chunk::Pointer> &&upper_neighbor_chunks,
-                     const ChunkBroker::Pointer &broker,
-                     std::shared_ptr<std::vector<const uint8_t *>> node_bases,
-                     std::shared_ptr<std::vector<const uint8_t *>> upper_bases)
+  HnswStreamerEntity(
+      IndexStreamer::Stats &stats, const HNSWHeader &hd, size_t chunk_size,
+      uint32_t node_index_mask_bits, uint32_t upper_neighbor_mask_bits,
+      bool filter_same_key, bool get_vector_enabled,
+      const NIHashMapPointer &upper_neighbor_index,
+      const std::shared_ptr<std::shared_mutex> &upper_neighbor_rw_mutex,
+      std::shared_ptr<ailego::SharedMutex> &keys_map_lock,
+      const HashMapPointer<key_t, node_id_t> &keys_map, bool use_key_info_map,
+      std::vector<Chunk::Pointer> &&node_chunks,
+      std::vector<Chunk::Pointer> &&upper_neighbor_chunks,
+      const ChunkBroker::Pointer &broker,
+      std::shared_ptr<std::vector<const uint8_t *>> node_bases,
+      std::shared_ptr<std::vector<const uint8_t *>> upper_bases)
       : stats_(stats),
         chunk_size_(chunk_size),
         node_index_mask_bits_(node_index_mask_bits),
@@ -269,6 +270,7 @@ class HnswStreamerEntity : public HnswEntity {
         filter_same_key_(filter_same_key),
         get_vector_enabled_(get_vector_enabled),
         use_key_info_map_(use_key_info_map),
+        upper_neighbor_rw_mutex_(upper_neighbor_rw_mutex),
         upper_neighbor_index_(upper_neighbor_index),
         keys_map_lock_(keys_map_lock),
         keys_map_(keys_map),
@@ -323,6 +325,10 @@ class HnswStreamerEntity : public HnswEntity {
 
   inline std::pair<uint32_t, uint32_t> get_upper_neighbor_chunk_loc(
       level_t level, node_id_t id) const {
+    // Shared lock: concurrent readers are fine, but they must synchronize
+    // with add_upper_neighbor's exclusive lock to avoid a data race on
+    // slots_.size() inside HnswIndexHashMap.
+    std::shared_lock<std::shared_mutex> lk(*upper_neighbor_rw_mutex_);
     auto it = upper_neighbor_index_->find(id);
     ailego_assert_abort(it != upper_neighbor_index_->end(),
                         "Get upper neighbor header failed");
@@ -370,6 +376,10 @@ class HnswStreamerEntity : public HnswEntity {
     if (level == 0) {
       return 0;
     }
+    // Exclusive lock: protects upper_neighbor_chunks_.emplace_back() and
+    // upper_neighbor_index_->insert() from racing with concurrent find()
+    // calls in get_upper_neighbor_chunk_loc().
+    std::unique_lock<std::shared_mutex> lk(*upper_neighbor_rw_mutex_);
     Chunk::Pointer chunk;
     uint64_t chunk_offset = UINT64_MAX;
     size_t neighbors_size = get_total_upper_neighbors_size(level);
@@ -408,17 +418,40 @@ class HnswStreamerEntity : public HnswEntity {
     meta.level = level;
     meta.index = (chunk_index << upper_neighbor_mask_bits_) |
                  (chunk_offset / upper_neighbor_size_);
+    size_t zero_start = chunk_offset;
     chunk_offset += upper_neighbor_size_ * level;
-    if (ailego_unlikely(!upper_neighbor_index_->insert(id, meta.data))) {
-      LOG_ERROR("HashMap insert value failed");
-      return IndexError_Runtime;
-    }
 
+    // IMPORTANT: order matters here.
+    // 1) resize so the chunk's data_size covers the new region.
+    // 2) zero-fill the new region: storage backends like BufferStorage do
+    //    NOT zero on resize -- only metadata is updated, and the underlying
+    //    page may contain stale content from a previously-evicted page.
+    //    Without this step, NeighborsHeader::neighbor_cnt is garbage and
+    //    select_entry_point()/search_neighbors() iterate over garbage
+    //    node_ids, eventually triggering find()'s assertion in
+    //    get_upper_neighbor_chunk_loc().
+    // 3) ONLY THEN publish the entry to upper_neighbor_index_, so that any
+    //    concurrent reader that finds this id already sees a properly
+    //    zeroed upper-neighbor slot.
     if (ailego_unlikely(chunk->resize(chunk_offset) != chunk_offset)) {
       LOG_ERROR("Chunk resize to %zu failed", (size_t)chunk_offset);
       return IndexError_Runtime;
     }
 
+    // Use std::vector instead of a VLA: VLAs are a GNU extension and may
+    // produce different codegen / be rejected under clang/MSVC.
+    std::vector<char> zeros(neighbors_size, 0);
+    if (ailego_unlikely(chunk->write(zero_start, zeros.data(),
+                                     neighbors_size) != neighbors_size)) {
+      LOG_ERROR("Chunk write zeros failed");
+      return IndexError_Runtime;
+    }
+
+    if (ailego_unlikely(!upper_neighbor_index_->insert(id, meta.data))) {
+      LOG_ERROR("HashMap insert value failed");
+      return IndexError_Runtime;
+    }
+
     return 0;
   }
 
@@ -529,6 +562,10 @@ class HnswStreamerEntity : public HnswEntity {
  protected:
   IndexStreamer::Stats &stats_;
   std::mutex mutex_{};
+  //! Guards upper_neighbor_index_ and upper_neighbor_chunks_ against
+  //! concurrent reads (find) and writes (insert/emplace_back).
+  //! Shared via shared_ptr so all clones synchronize on the SAME mutex.
+  mutable std::shared_ptr<std::shared_mutex> upper_neighbor_rw_mutex_{};
   size_t max_index_size_{0UL};
   uint32_t chunk_size_{kDefaultChunkSize};
   uint32_t upper_neighbor_chunk_size_{kDefaultChunkSize};
diff --git a/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_index_hash.h b/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_index_hash.h
index 4f01aabb3..bf3dc1e7c 100644
--- a/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_index_hash.h
+++ b/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_index_hash.h
@@ -41,9 +41,9 @@ class HnswIndexHashMap {
           items_(reinterpret_cast<const Item *>(data)) {}
     //! Return a empty loc or the key item loc
 
-    Slot(Chunk::Pointer &&chunk, IndexStorage::MemoryBlock &&mem_block)
-        : chunk_(std::move(chunk)), items_block_(std::move(mem_block)) {
-      items_ = reinterpret_cast<const Item *>(items_block_.data());
+    Slot(Chunk::Pointer &&chunk, std::vector<char> &&local_data)
+        : chunk_(std::move(chunk)), local_data_(std::move(local_data)) {
+      items_ = reinterpret_cast<const Item *>(local_data_.data());
     }
     const_iterator find(key_type key, uint32_t max_items, uint32_t mask) const {
       auto it = &items_[key & mask];
@@ -73,8 +73,8 @@ class HnswIndexHashMap {
 
    private:
     Chunk::Pointer chunk_{};
-    const Item *items_{nullptr};  // point to chunk data
-    IndexStorage::MemoryBlock items_block_{};
+    const Item *items_{nullptr};  // point to local_data_
+    std::vector<char> local_data_{};
   };
 
  public:
@@ -179,14 +179,18 @@ class HnswIndexHashMap {
       LOG_ERROR("Chunk resize failed, size=%zu", size);
       return false;
     }
-    //! Read the whole data to memory
-    IndexStorage::MemoryBlock data_block;
-    if (ailego_unlikely(chunk->read(0U, data_block, size) != size)) {
-      LOG_ERROR("Chunk read failed, size=%zu", size);
-      return false;
-    }
-
-    slots_.emplace_back(std::move(chunk), std::move(data_block));
+    //! Use a local zero-initialized buffer; new chunks contain all zeros,
+    //! so no buffer-pool read is needed and no ref_count is pinned.
+    //! NOTE: Previously this used `chunk->read(0U, data_block, size)` which
+    //! returns a view into the underlying BufferPool page. That made the
+    //! Slot's `items_` pointer alias buffer-pool memory shared across
+    //! threads, which under clang -O3 release exposed a data race on
+    //! Slot::find()'s probing read of `it->second` (concurrent
+    //! const_cast writes from insert() were not reliably visible). Using a
+    //! private zero-initialized vector matches the HNSW (non-RABITQ)
+    //! implementation and avoids this race.
+    std::vector<char> local_buf(size, 0);
+    slots_.emplace_back(std::move(chunk), std::move(local_buf));
     return true;
   }
 
@@ -208,13 +212,14 @@ class HnswIndexHashMap {
             i, chunk->data_size(), size);
         return IndexError_InvalidFormat;
       }
-      //! Read the whole data to memory
-      IndexStorage::MemoryBlock data_block;
-      if (ailego_unlikely(chunk->read(0U, data_block, size) != size)) {
-        LOG_ERROR("Chunk read failed, size=%zu", size);
-        return false;
+      //! Copy chunk data into a local buffer via fetch() so that no
+      //! buffer-pool block is pinned for the lifetime of the Slot.
+      std::vector<char> local_buf(size);
+      if (ailego_unlikely(chunk->fetch(0U, local_buf.data(), size) != size)) {
+        LOG_ERROR("Chunk fetch failed, size=%zu", size);
+        return IndexError_InvalidFormat;
       }
-      slots_.emplace_back(std::move(chunk), std::move(data_block));
+      slots_.emplace_back(std::move(chunk), std::move(local_buf));
     }
     return 0;
   }
diff --git a/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer.cc b/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer.cc
index 9eacf0bc6..2ea2f6aa0 100644
--- a/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer.cc
+++ b/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer.cc
@@ -40,7 +40,7 @@ HnswRabitqStreamer::HnswRabitqStreamer(IndexProvider::Pointer provider,
       provider_(std::move(provider)) {}
 
 HnswRabitqStreamer::~HnswRabitqStreamer() {
-  if (state_ == STATE_INITED) {
+  if (state_ == STATE_INITED || state_ == STATE_OPENED) {
     this->cleanup();
   }
 }
diff --git a/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.cc b/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.cc
index 35501ed94..cef59c35c 100644
--- a/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.cc
+++ b/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.cc
@@ -34,6 +34,7 @@ int HnswRabitqStreamerEntity::init(size_t max_doc_cnt) {
   std::lock_guard<std::mutex> lock(mutex_);
   broker_ = std::make_shared<HnswRabitqChunkBroker>(stats_);
   upper_neighbor_index_ = std::make_shared<NIHashMap>();
+  upper_neighbor_rw_mutex_ = std::make_shared<std::shared_mutex>();
   keys_map_lock_ = std::make_shared<ailego::SharedMutex>();
   keys_map_ = std::make_shared<HashMap<key_t, node_id_t>>();
   if (!keys_map_ || !upper_neighbor_index_ || !broker_ || !keys_map_lock_) {
@@ -697,8 +698,9 @@ const HnswRabitqEntity::Pointer HnswRabitqStreamerEntity::clone() const {
       new (std::nothrow) HnswRabitqStreamerEntity(
           stats_, header(), chunk_size_, node_index_mask_bits_,
           upper_neighbor_mask_bits_, filter_same_key_, get_vector_enabled_,
-          upper_neighbor_index_, keys_map_lock_, keys_map_, use_key_info_map_,
-          std::move(node_chunks), std::move(upper_neighbor_chunks), broker_);
+          upper_neighbor_index_, upper_neighbor_rw_mutex_, keys_map_lock_,
+          keys_map_, use_key_info_map_, std::move(node_chunks),
+          std::move(upper_neighbor_chunks), broker_);
   if (ailego_unlikely(!entity)) {
     LOG_ERROR("HnswRabitqStreamerEntity new failed");
   }
diff --git a/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.h b/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.h
index ea36143af..7c5b600e7 100644
--- a/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.h
+++ b/src/core/algorithm/hnsw_rabitq/hnsw_rabitq_streamer_entity.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <iostream>
+#include <shared_mutex>
 #include <ailego/parallel/lock.h>
 #include <sparsehash/dense_hash_map>
 #include <sparsehash/dense_hash_set>
@@ -216,17 +217,17 @@ class HnswRabitqStreamerEntity : public HnswRabitqEntity {
   using NIHashMapPointer = std::shared_ptr<NIHashMap>;
 
   //! Private construct, only be called by clone method
-  HnswRabitqStreamerEntity(IndexStreamer::Stats &stats, const HNSWHeader &hd,
-                           size_t chunk_size, uint32_t node_index_mask_bits,
-                           uint32_t upper_neighbor_mask_bits,
-                           bool filter_same_key, bool get_vector_enabled,
-                           const NIHashMapPointer &upper_neighbor_index,
-                           std::shared_ptr<ailego::SharedMutex> &keys_map_lock,
-                           const HashMapPointer<key_t, node_id_t> &keys_map,
-                           bool use_key_info_map,
-                           std::vector<Chunk::Pointer> &&node_chunks,
-                           std::vector<Chunk::Pointer> &&upper_neighbor_chunks,
-                           const HnswRabitqChunkBroker::Pointer &broker)
+  HnswRabitqStreamerEntity(
+      IndexStreamer::Stats &stats, const HNSWHeader &hd, size_t chunk_size,
+      uint32_t node_index_mask_bits, uint32_t upper_neighbor_mask_bits,
+      bool filter_same_key, bool get_vector_enabled,
+      const NIHashMapPointer &upper_neighbor_index,
+      const std::shared_ptr<std::shared_mutex> &upper_neighbor_rw_mutex,
+      std::shared_ptr<ailego::SharedMutex> &keys_map_lock,
+      const HashMapPointer<key_t, node_id_t> &keys_map, bool use_key_info_map,
+      std::vector<Chunk::Pointer> &&node_chunks,
+      std::vector<Chunk::Pointer> &&upper_neighbor_chunks,
+      const HnswRabitqChunkBroker::Pointer &broker)
       : stats_(stats),
         chunk_size_(chunk_size),
         node_index_mask_bits_(node_index_mask_bits),
@@ -237,6 +238,7 @@ class HnswRabitqStreamerEntity : public HnswRabitqEntity {
         filter_same_key_(filter_same_key),
         get_vector_enabled_(get_vector_enabled),
         use_key_info_map_(use_key_info_map),
+        upper_neighbor_rw_mutex_(upper_neighbor_rw_mutex),
         upper_neighbor_index_(upper_neighbor_index),
         keys_map_lock_(keys_map_lock),
         keys_map_(keys_map),
@@ -286,6 +288,11 @@ class HnswRabitqStreamerEntity : public HnswRabitqEntity {
 
   inline std::pair<uint32_t, uint32_t> get_upper_neighbor_chunk_loc(
       level_t level, node_id_t id) const {
+    // Shared lock: concurrent readers are fine, but they must synchronize
+    // with add_upper_neighbor's exclusive lock to avoid a data race on
+    // slots_.size() inside HnswIndexHashMap (the emplace_back in alloc_slot
+    // is not atomic, so a concurrent find() may see a stale size value).
+    std::shared_lock<std::shared_mutex> lk(*upper_neighbor_rw_mutex_);
     auto it = upper_neighbor_index_->find(id);
     ailego_assert_abort(it != upper_neighbor_index_->end(),
                         "Get upper neighbor header failed");
@@ -334,6 +341,10 @@ class HnswRabitqStreamerEntity : public HnswRabitqEntity {
     if (level == 0) {
       return 0;
     }
+    // Exclusive lock: protects upper_neighbor_chunks_.emplace_back() and
+    // upper_neighbor_index_->insert() from racing with concurrent find()
+    // calls in get_upper_neighbor_chunk_loc().
+    std::unique_lock<std::shared_mutex> lk(*upper_neighbor_rw_mutex_);
     Chunk::Pointer chunk;
     uint64_t chunk_offset = -1UL;
     size_t neighbors_size = get_total_upper_neighbors_size(level);
@@ -373,17 +384,40 @@ class HnswRabitqStreamerEntity : public HnswRabitqEntity {
     meta.level = level;
     meta.index = (chunk_index << upper_neighbor_mask_bits_) |
                  (chunk_offset / upper_neighbor_size_);
+    size_t zero_start = chunk_offset;
     chunk_offset += upper_neighbor_size_ * level;
-    if (ailego_unlikely(!upper_neighbor_index_->insert(id, meta.data))) {
-      LOG_ERROR("HashMap insert value failed");
-      return IndexError_Runtime;
-    }
 
+    // IMPORTANT: order matters here.
+    // 1) resize so the chunk's data_size covers the new region.
+    // 2) zero-fill the new region: storage backends like BufferStorage do
+    //    NOT zero on resize -- only metadata is updated, and the underlying
+    //    page may contain stale content from a previously-evicted page.
+    //    Without this step, NeighborsHeader::neighbor_cnt is garbage and
+    //    select_entry_point()/search_neighbors() iterate over garbage
+    //    node_ids, eventually triggering find()'s assertion in
+    //    get_upper_neighbor_chunk_loc().
+    // 3) ONLY THEN publish the entry to upper_neighbor_index_, so that any
+    //    concurrent reader that finds this id already sees a properly
+    //    zeroed upper-neighbor slot.
     if (ailego_unlikely(chunk->resize(chunk_offset) != chunk_offset)) {
       LOG_ERROR("Chunk resize to %zu failed", (size_t)chunk_offset);
       return IndexError_Runtime;
     }
 
+    // Use std::vector instead of a VLA: VLAs are a GNU extension and may
+    // produce different codegen / be rejected under clang/MSVC.
+    std::vector<char> zeros(neighbors_size, 0);
+    if (ailego_unlikely(chunk->write(zero_start, zeros.data(),
+                                     neighbors_size) != neighbors_size)) {
+      LOG_ERROR("Chunk write zeros failed");
+      return IndexError_Runtime;
+    }
+
+    if (ailego_unlikely(!upper_neighbor_index_->insert(id, meta.data))) {
+      LOG_ERROR("HashMap insert value failed");
+      return IndexError_Runtime;
+    }
+
     return 0;
   }
 
@@ -503,6 +537,11 @@ class HnswRabitqStreamerEntity : public HnswRabitqEntity {
   bool get_vector_enabled_{false};
   bool use_key_info_map_{true};
 
+  // Shared via shared_ptr so that all cloned entities synchronize against
+  // the SAME mutex instance. A plain std::shared_mutex member would be
+  // independent per clone and provide no real protection for the shared
+  // upper_neighbor_index_ hashmap.
+  mutable std::shared_ptr<std::shared_mutex> upper_neighbor_rw_mutex_{};
   NIHashMapPointer upper_neighbor_index_{};
 
   mutable std::shared_ptr<ailego::SharedMutex> keys_map_lock_{};
diff --git a/src/core/algorithm/hnsw_sparse/hnsw_sparse_streamer.cc b/src/core/algorithm/hnsw_sparse/hnsw_sparse_streamer.cc
index 3abce8087..20c215257 100644
--- a/src/core/algorithm/hnsw_sparse/hnsw_sparse_streamer.cc
+++ b/src/core/algorithm/hnsw_sparse/hnsw_sparse_streamer.cc
@@ -27,7 +27,7 @@ namespace core {
 HnswSparseStreamer::HnswSparseStreamer() : entity_(stats_) {}
 
 HnswSparseStreamer::~HnswSparseStreamer() {
-  if (state_ == STATE_INITED) {
+  if (state_ == STATE_INITED || state_ == STATE_OPENED) {
     this->cleanup();
   }
 }
diff --git a/src/core/algorithm/vamana/vamana_streamer.cc b/src/core/algorithm/vamana/vamana_streamer.cc
index ae935eb81..2738a98ad 100644
--- a/src/core/algorithm/vamana/vamana_streamer.cc
+++ b/src/core/algorithm/vamana/vamana_streamer.cc
@@ -26,7 +26,7 @@ namespace core {
 VamanaStreamer::VamanaStreamer() = default;
 
 VamanaStreamer::~VamanaStreamer() {
-  if (state_ == STATE_INITED) {
+  if (state_ == STATE_INITED || state_ == STATE_OPENED) {
     this->cleanup();
   }
 }
diff --git a/src/core/interface/indexes/ivf_index.cc b/src/core/interface/indexes/ivf_index.cc
index 0cfba037c..1b91eebea 100644
--- a/src/core/interface/indexes/ivf_index.cc
+++ b/src/core/interface/indexes/ivf_index.cc
@@ -84,15 +84,22 @@ int IVFIndex::Open(const std::string &file_path,
       break;
     }
     case StorageOptions::StorageType::kBufferPool: {
-      storage_ = core::IndexFactory::CreateStorage("BufferStorage");
+      // NOTE: IVF index is dumped via FileDumper (plain binary file), which is
+      // not compatible with BufferStorage's IndexFormat layout (header/footer
+      // chain). Until IVF gains a BufferStorage-aware dump path, fall back to
+      // MMapFileReadStorage so the freshly-dumped file can be reopened.
+      storage_ = core::IndexFactory::CreateStorage("MMapFileReadStorage");
       if (storage_ == nullptr) {
-        LOG_ERROR("Failed to create BufferStorage");
+        LOG_ERROR(
+            "Failed to create MMapFileReadStorage (IVF buffer-pool fallback)");
         return core::IndexError_Runtime;
       }
       int ret = storage_->init(storage_params);
       if (ret != 0) {
-        LOG_ERROR("Failed to init BufferStorage, path: %s, err: %s",
-                  file_path_.c_str(), core::IndexError::What(ret));
+        LOG_ERROR(
+            "Failed to init MMapFileReadStorage (IVF buffer-pool fallback), "
+            "path: %s, err: %s",
+            file_path_.c_str(), core::IndexError::What(ret));
         return ret;
       }
       break;
diff --git a/src/core/utility/buffer_storage.cc b/src/core/utility/buffer_storage.cc
index c934dd5d9..bf2485724 100644
--- a/src/core/utility/buffer_storage.cc
+++ b/src/core/utility/buffer_storage.cc
@@ -12,9 +12,16 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <sys/stat.h>
 #include <algorithm>
+#include <atomic>
+#include <cstring>
+#include <functional>
 #include <mutex>
+#include <shared_mutex>
+#include <thread>
 #include <zvec/ailego/buffer/vector_page_table.h>
+#include <zvec/ailego/io/file.h>
 #include <zvec/ailego/utility/time_helper.h>
 #include <zvec/core/framework/index_error.h>
 #include <zvec/core/framework/index_factory.h>
@@ -24,8 +31,62 @@
 
 namespace zvec {
 namespace core {
+namespace {
 
-/*! MMap File Storage
+// Cross-compiler helpers for lock-free 64-bit acquire/release access
+// to SegmentMeta::data_size / padding_size.
+//
+// These fields are POD (uint64_t) inside a serialised struct so we cannot
+// change their type to std::atomic<>; std::atomic_ref is C++20 and the
+// project targets C++17.  GCC/Clang have native __atomic_* builtins that
+// emit single ldar/stlr on arm64 and plain mov on x86_64.  MSVC lacks
+// these builtins, so we fall back to volatile load/store paired with a
+// std::atomic_thread_fence, which is correct on all targets MSVC ships
+// (x86_64 / arm64 desktop) and equivalent in cost.
+inline uint64_t bs_load_acquire(const uint64_t *p) {
+#if defined(__GNUC__) || defined(__clang__)
+  return __atomic_load_n(p, __ATOMIC_ACQUIRE);
+#else
+  uint64_t v = *static_cast<const volatile uint64_t *>(p);
+  std::atomic_thread_fence(std::memory_order_acquire);
+  return v;
+#endif
+}
+
+inline uint64_t bs_load_relaxed(const uint64_t *p) {
+#if defined(__GNUC__) || defined(__clang__)
+  return __atomic_load_n(p, __ATOMIC_RELAXED);
+#else
+  return *static_cast<const volatile uint64_t *>(p);
+#endif
+}
+
+inline void bs_store_release(uint64_t *p, uint64_t v) {
+#if defined(__GNUC__) || defined(__clang__)
+  __atomic_store_n(p, v, __ATOMIC_RELEASE);
+#else
+  std::atomic_thread_fence(std::memory_order_release);
+  *static_cast<volatile uint64_t *>(p) = v;
+#endif
+}
+
+inline void bs_store_relaxed(uint64_t *p, uint64_t v) {
+#if defined(__GNUC__) || defined(__clang__)
+  __atomic_store_n(p, v, __ATOMIC_RELAXED);
+#else
+  *static_cast<volatile uint64_t *>(p) = v;
+#endif
+}
+
+}  // namespace
+
+// The legacy read(const void**) overload guarantees the returned pointer
+// stays valid until close_index().  Single-page reads pin the page
+// (never released); cross-page reads allocate a temp buffer owned by
+// tmp_buffers_ (freed in close_index()).  Callers wanting bounded
+// lifetime should use the read(MemoryBlock&) overload.
+
+/*! Buffer Storage
  */
 class BufferStorage : public IndexStorage {
  public:
@@ -37,33 +98,38 @@ class BufferStorage : public IndexStorage {
     //! Index Storage Pointer
     typedef std::shared_ptr<Segment> Pointer;
 
-    //! Constructor
-    WrappedSegment(BufferStorage *owner, IndexMapping::Segment *segment,
-                   uint64_t segment_header_start_offset,
-                   IndexFormat::MetaHeader *segment_header, size_t segment_id)
-        : segment_(segment),
+    //! Constructor.  See segment_info_ for the pointer-stability contract.
+    WrappedSegment(BufferStorage *owner, IndexMapping::SegmentInfo *info,
+                   size_t segment_id)
+        : segment_info_(info),
           owner_(owner),
           segment_id_(segment_id),
-          capacity_(static_cast<size_t>(segment->meta()->data_size +
-                                        segment->meta()->padding_size)),
-          segment_header_start_offset_(segment_header_start_offset),
-          segment_header_(segment_header) {}
+          capacity_(static_cast<size_t>(info->segment.meta()->data_size +
+                                        info->segment.meta()->padding_size)) {}
     //! Destructor
     ~WrappedSegment(void) override {}
 
     //! Retrieve size of data
+    //!
+    //! data_size / padding_size are mutated lock-free by concurrent
+    //! writers (write/resize) and observed by concurrent readers on the
+    //! lock-free hot path.  Use acquire/release ordering so weakly-ordered
+    //! ARM (e.g. Android arm64) cannot see stale values that would cause
+    //! read() to truncate len to 0.
     size_t data_size(void) const override {
-      return static_cast<size_t>(segment_->meta()->data_size);
+      return static_cast<size_t>(
+          bs_load_acquire(&segment_info_->segment.meta()->data_size));
     }
 
     //! Retrieve crc of data
     uint32_t data_crc(void) const override {
-      return segment_->meta()->data_crc;
+      return segment_info_->segment.meta()->data_crc;
     }
 
     //! Retrieve size of padding
     size_t padding_size(void) const override {
-      return static_cast<size_t>(segment_->meta()->padding_size);
+      return static_cast<size_t>(
+          bs_load_acquire(&segment_info_->segment.meta()->padding_size));
     }
 
     //! Retrieve capacity of segment
@@ -72,36 +138,57 @@ class BufferStorage : public IndexStorage {
     }
 
     //! Fetch data from segment (with own buffer)
+    //!
+    //! C1: pool/handle are stable for the lifetime of the index
+    //! (no retire/rebuild), so no lock is needed on the hot path.
     size_t fetch(size_t offset, void *buf, size_t len) const override {
-      if (ailego_unlikely(offset + len > segment_->meta()->data_size)) {
-        auto meta = segment_->meta();
-        if (offset > meta->data_size) {
-          offset = meta->data_size;
+      if (ailego_unlikely(!owner_->buffer_pool_handle_)) {
+        LOG_ERROR("WrappedSegment::fetch: handle is null, file[%s], id[%zu]",
+                  owner_->file_name_.c_str(), segment_id_);
+        return 0;
+      }
+      const size_t data_size =
+          bs_load_acquire(&segment_info_->segment.meta()->data_size);
+      if (ailego_unlikely(offset > data_size || len > data_size - offset)) {
+        if (offset > data_size) {
+          offset = data_size;
         }
-        len = meta->data_size - offset;
+        len = data_size - offset;
       }
-      size_t abs_offset = segment_header_start_offset_ +
-                          segment_header_->content_offset +
-                          segment_->meta()->data_index + offset;
+      size_t abs_offset = segment_info_->segment_header_start_offset +
+                          segment_info_->segment_header->content_offset +
+                          segment_info_->segment.meta()->data_index + offset;
       if (!owner_->buffer_pool_handle_->read_range(abs_offset, len,
                                                    static_cast<char *>(buf))) {
+        LOG_ERROR(
+            "WrappedSegment::fetch: read_range failed, file[%s], id[%zu], "
+            "abs_offset=%zu, len=%zu",
+            owner_->file_name_.c_str(), segment_id_, abs_offset, len);
         return 0;
       }
       return len;
     }
 
     //! Read data from segment
+    //! C1: lock-free hot path (pool/handle never change during operation).
     size_t read(size_t offset, const void **data, size_t len) override {
-      if (ailego_unlikely(offset + len > segment_->meta()->data_size)) {
-        auto meta = segment_->meta();
-        if (offset > meta->data_size) {
-          offset = meta->data_size;
+      if (ailego_unlikely(!owner_->buffer_pool_handle_)) {
+        LOG_ERROR("WrappedSegment::read: handle is null, file[%s], id[%zu]",
+                  owner_->file_name_.c_str(), segment_id_);
+        *data = nullptr;
+        return 0;
+      }
+      const size_t data_size =
+          bs_load_acquire(&segment_info_->segment.meta()->data_size);
+      if (ailego_unlikely(offset > data_size || len > data_size - offset)) {
+        if (offset > data_size) {
+          offset = data_size;
         }
-        len = meta->data_size - offset;
+        len = data_size - offset;
       }
-      size_t abs_offset = segment_header_start_offset_ +
-                          segment_header_->content_offset +
-                          segment_->meta()->data_index + offset;
+      size_t abs_offset = segment_info_->segment_header_start_offset +
+                          segment_info_->segment_header->content_offset +
+                          segment_info_->segment.meta()->data_index + offset;
       size_t first_page = abs_offset / ailego::kVectorPageSize;
       size_t last_page = (len == 0)
                              ? first_page
@@ -111,35 +198,86 @@ class BufferStorage : public IndexStorage {
         char *raw = owner_->buffer_pool_handle_->get_single_page(abs_offset,
                                                                  len, page_id);
         if (!raw) {
+          LOG_ERROR(
+              "WrappedSegment::read: single-page acquire failed, file[%s], "
+              "id[%zu], abs_offset=%zu, len=%zu, page=%zu",
+              owner_->file_name_.c_str(), segment_id_, abs_offset, len,
+              first_page);
+          *data = nullptr;
           return 0;
         }
         *data = raw;
+        // Pin held until close_index() per the never-released contract
+        // of this overload.
+        (void)page_id;
         return len;
       }
-      char *tmp = static_cast<char *>(ailego_aligned_malloc(len, 4096));
+      // Cross-page path: see file-level banner.  C11 aligned_alloc requires
+      // size to be a multiple of alignment, and alignment must be a power
+      // of two.  Use a fixed 4096-byte alignment for the dst buffer: 4K is
+      // the minimum page granularity across all supported platforms
+      // (always a divisor of the 16K/64K page sizes used on Apple Silicon
+      // and some Android arm64 configurations) and is sufficient for the
+      // downstream SIMD/DMA-friendly access contract.  Pinning kAlign to
+      // 4096 also avoids over-allocating 16KB per cross-page read on
+      // large-page platforms.
+      static constexpr size_t kAlign = 4096UL;
+      size_t alloc_size = (len + (kAlign - 1UL)) & ~(kAlign - 1UL);
+      // Allocate a 4K-aligned slot from the per-storage arena pool.
+      // This batches page-aligned allocation: under heap fragmentation
+      // (notably Android Bionic scudo), one large posix_memalign per
+      // arena via the secondary (mmap-backed) allocator is far more
+      // reliable than many independent posix_memalign(4K, 4K) calls.
+      char *tmp = nullptr;
+      {
+        std::lock_guard<std::mutex> tmp_latch(owner_->tmp_buffers_mutex_);
+        tmp = owner_->tmp_arena_alloc_locked(alloc_size);
+      }
       if (!tmp) {
+        LOG_ERROR(
+            "WrappedSegment::read: cross-page alloc failed, file[%s], "
+            "id[%zu], abs_offset=%zu, len=%zu, alloc_size=%zu, align=%zu",
+            owner_->file_name_.c_str(), segment_id_, abs_offset, len,
+            alloc_size, kAlign);
+        *data = nullptr;
         return 0;
       }
       if (!owner_->buffer_pool_handle_->read_range(abs_offset, len, tmp)) {
-        ailego_free(tmp);
+        LOG_ERROR(
+            "WrappedSegment::read: cross-page read_range failed, file[%s], "
+            "id[%zu], abs_offset=%zu, len=%zu, first_page=%zu, last_page=%zu",
+            owner_->file_name_.c_str(), segment_id_, abs_offset, len,
+            first_page, last_page);
+        // The arena slot is intentionally not rolled back: rolling back
+        // would require holding the arena lock across read_range, while
+        // the worst-case leak per failed read is one slot (alloc_size).
+        *data = nullptr;
         return 0;
       }
-      owner_->register_tmp_buffer(tmp);
       *data = tmp;
       return len;
     }
 
+    //! C1: lock-free hot path (pool/handle never change during operation).
     size_t read(size_t offset, MemoryBlock &data, size_t len) override {
-      if (ailego_unlikely(offset + len > segment_->meta()->data_size)) {
-        auto meta = segment_->meta();
-        if (offset > meta->data_size) {
-          offset = meta->data_size;
+      if (ailego_unlikely(!owner_->buffer_pool_handle_)) {
+        LOG_ERROR(
+            "WrappedSegment::read(MemoryBlock&): handle is null, file[%s], "
+            "id[%zu]",
+            owner_->file_name_.c_str(), segment_id_);
+        return 0;
+      }
+      const size_t data_size =
+          bs_load_acquire(&segment_info_->segment.meta()->data_size);
+      if (ailego_unlikely(offset > data_size || len > data_size - offset)) {
+        if (offset > data_size) {
+          offset = data_size;
         }
-        len = meta->data_size - offset;
+        len = data_size - offset;
       }
-      size_t abs_offset = segment_header_start_offset_ +
-                          segment_header_->content_offset +
-                          segment_->meta()->data_index + offset;
+      size_t abs_offset = segment_info_->segment_header_start_offset +
+                          segment_info_->segment_header->content_offset +
+                          segment_info_->segment.meta()->data_index + offset;
       size_t first_page = abs_offset / ailego::kVectorPageSize;
       size_t last_page = (len == 0)
                              ? first_page
@@ -150,38 +288,154 @@ class BufferStorage : public IndexStorage {
                                                                  len, page_id);
         if (!raw) {
           LOG_ERROR("read error (single-page acquire failed).");
-          return -1;
+          return 0;
         }
         data.reset(owner_->buffer_pool_handle_.get(), page_id, raw);
         return len;
       }
-      char *tmp = static_cast<char *>(ailego_aligned_malloc(len, 4096));
+      // C11 aligned_alloc requires the requested size to be a multiple of
+      // the alignment, and alignment must be a power of two.  See the
+      // sibling read(const void**) overload above for the rationale of
+      // pinning kAlign to a fixed 4096 instead of sysconf(_SC_PAGESIZE).
+      static constexpr size_t kAlign = 4096UL;
+      size_t alloc_size = (len + (kAlign - 1UL)) & ~(kAlign - 1UL);
+      char *tmp =
+          static_cast<char *>(ailego_aligned_malloc(alloc_size, kAlign));
       if (!tmp) {
         LOG_ERROR("read error (alloc cross-page temp buffer failed).");
-        return -1;
+        return 0;
       }
       if (!owner_->buffer_pool_handle_->read_range(abs_offset, len, tmp)) {
         ailego_free(tmp);
         LOG_ERROR("read error (cross-page read_range failed).");
-        return -1;
+        return 0;
       }
-      data = MemoryBlock::MakeOwned(tmp);
+      data = MemoryBlock::MakeOwned(tmp, len);
       return len;
     }
 
-    //! Write data into the storage with offset
-    size_t write(size_t /*offset*/, const void * /*data*/,
-                 size_t len) override {
+    //! Write data into the storage with offset.
+    //!
+    //! Locking: shared shard latch pairs with flush_index()'s exclusive
+    //! all-shards latch -- excludes CRC compute over meta_buf while we
+    //! mutate (data_size, padding_size).  meta_mtx_ additionally
+    //! serialises concurrent writers on the SAME segment so the pair
+    //! stays consistent (sum == capacity_).
+    size_t write(size_t offset, const void *data, size_t len) override {
+      std::shared_lock<std::shared_mutex> latch(
+          owner_->mapping_shards_[owner_->mapping_shard_id()].mtx);
+      if (ailego_unlikely(!owner_->buffer_pool_handle_ ||
+                          !owner_->buffer_pool_)) {
+        LOG_ERROR("WrappedSegment::write: pool is null, file[%s], id[%zu]",
+                  owner_->file_name_.c_str(), segment_id_);
+        return 0;
+      }
+      if (ailego_unlikely(owner_->corrupted_.load(std::memory_order_acquire))) {
+        LOG_ERROR(
+            "WrappedSegment::write: storage is marked corrupted, refusing "
+            "write, file[%s], id[%zu]",
+            owner_->file_name_.c_str(), segment_id_);
+        return 0;
+      }
+      // In read-only mode the write is a silent no-op so that callers that
+      // unconditionally write (e.g. CRC updates) do not return an error.
+      if (!owner_->buffer_pool_->writable()) {
+        return len;
+      }
+      if (ailego_unlikely(offset > capacity_ || len > capacity_ - offset)) {
+        LOG_ERROR(
+            "write() exceeds segment capacity: offset=%zu len=%zu cap=%zu",
+            offset, len, capacity_);
+        return 0;
+      }
+      auto meta = segment_info_->segment.meta();
+      size_t abs_offset = segment_info_->segment_header_start_offset +
+                          segment_info_->segment_header->content_offset +
+                          meta->data_index + offset;
+      // Write the bytes BEFORE publishing the new data_size to readers.
+      // Lock-free readers observe data_size with acquire ordering; the
+      // release-store below establishes happens-before with the page
+      // contents written above.  Publishing data_size first (the previous
+      // ordering) allowed a reader on weakly-ordered ARM to see the new
+      // length but still read stale page contents -- or, in the inverse
+      // direction, see a stale length and truncate len to 0
+      // (root cause of "Read sparse vector failed ... ret=0").
+      if (owner_->buffer_pool_handle_->write_range(
+              abs_offset, len, static_cast<const char *>(data)) != 0) {
+        LOG_ERROR("write() page-cache write_range failed at abs_offset=%zu",
+                  abs_offset);
+        return 0;
+      }
+      {
+        std::lock_guard<std::mutex> meta_latch(meta_mtx_);
+        uint64_t cur = bs_load_relaxed(&meta->data_size);
+        if (offset + len > cur) {
+          uint64_t new_size = offset + len;
+          // padding_size is paired with data_size; publish it first
+          // (relaxed) so readers that acquire data_size see a
+          // consistent (data_size + padding_size == capacity_) pair.
+          bs_store_relaxed(&meta->padding_size, capacity_ - new_size);
+          bs_store_release(&meta->data_size, new_size);
+        }
+      }
+      // Mark dirty unconditionally even when data_size did not grow:
+      // fixed-size in-place rewrites (e.g. chunk_meta_segment) must still
+      // trigger flush_all() before the next append_segment().
+      owner_->set_as_dirty();
       return len;
     }
 
-    //! Resize size of data
-    size_t resize(size_t /*size*/) override {
-      return 0;
+    //! Resize size of data.  See write() for the locking contract.
+    size_t resize(size_t size) override {
+      std::shared_lock<std::shared_mutex> latch(
+          owner_->mapping_shards_[owner_->mapping_shard_id()].mtx);
+      if (ailego_unlikely(owner_->corrupted_.load(std::memory_order_acquire))) {
+        LOG_ERROR(
+            "WrappedSegment::resize: storage is marked corrupted, refusing "
+            "resize, file[%s], id[%zu]",
+            owner_->file_name_.c_str(), segment_id_);
+        return 0;
+      }
+      auto meta = segment_info_->segment.meta();
+      bool changed = false;
+      {
+        std::lock_guard<std::mutex> meta_latch(meta_mtx_);
+        uint64_t cur = bs_load_relaxed(&meta->data_size);
+        if (cur != size) {
+          if (size > capacity_) {
+            size = capacity_;
+          }
+          // See write() for the publish ordering rationale: padding first
+          // (relaxed), then release-store data_size so concurrent lock-free
+          // readers observe a consistent pair.
+          bs_store_relaxed(&meta->padding_size, capacity_ - size);
+          bs_store_release(&meta->data_size, size);
+          changed = true;
+        }
+      }
+      if (changed) {
+        owner_->set_as_dirty();
+      }
+      return size;
     }
 
-    //! Update crc of data
-    void update_data_crc(uint32_t /*crc*/) override {}
+    //! Update crc of data.  See write() for the locking contract.
+    void update_data_crc(uint32_t crc) override {
+      std::shared_lock<std::shared_mutex> latch(
+          owner_->mapping_shards_[owner_->mapping_shard_id()].mtx);
+      if (ailego_unlikely(owner_->corrupted_.load(std::memory_order_acquire))) {
+        LOG_ERROR(
+            "WrappedSegment::update_data_crc: storage is marked corrupted, "
+            "refusing CRC update, file[%s], id[%zu]",
+            owner_->file_name_.c_str(), segment_id_);
+        return;
+      }
+      {
+        std::lock_guard<std::mutex> meta_latch(meta_mtx_);
+        segment_info_->segment.meta()->data_crc = crc;
+      }
+      owner_->set_as_dirty();
+    }
 
     //! Clone the segment
     IndexStorage::Segment::Pointer clone(void) override {
@@ -190,14 +444,18 @@ class BufferStorage : public IndexStorage {
 
    protected:
     friend BufferStorage;
-    IndexMapping::Segment *segment_{};
+    // Pointer into BufferStorage::segments_ (unordered_map mapped value).
+    // The address is stable across map insertions, so re-parses after
+    // append_segment() are picked up without recreating WrappedSegment.
+    IndexMapping::SegmentInfo *segment_info_{nullptr};
+    // Serialises hot-path writers on the SAME segment so
+    // (data_size, padding_size, data_crc) updates do not interleave.
+    mutable std::mutex meta_mtx_{};
 
    private:
     BufferStorage *owner_{nullptr};
     size_t segment_id_{};
     size_t capacity_{};
-    uint64_t segment_header_start_offset_;
-    IndexFormat::MetaHeader *segment_header_;
   };
 
   //! Destructor
@@ -211,7 +469,11 @@ class BufferStorage : public IndexStorage {
   }
 
   //! Initialize storage
-  int init(const ailego::Params & /*params*/) override {
+  int init(const ailego::Params &params) override {
+    uint32_t val = params.get_as_uint32(MMAPFILE_STORAGE_SEGMENT_META_CAPACITY);
+    if (val != 0) {
+      segment_meta_capacity_ = val;
+    }
     return 0;
   }
 
@@ -222,62 +484,62 @@ class BufferStorage : public IndexStorage {
   }
 
   //! Open storage
-  int open(const std::string &path, bool /*create_if_missing*/) override {
+  int open(const std::string &path, bool create_if_missing) override {
     file_name_ = path;
-    buffer_pool_ = std::make_shared<ailego::VecBufferPool>(path);
+    if (!ailego::File::IsExist(path) && create_if_missing) {
+      size_t last_slash = path.rfind('/');
+      if (last_slash != std::string::npos) {
+        ailego::File::MakePath(path.substr(0, last_slash));
+      }
+      int error_code = this->init_index(path);
+      if (error_code != 0) {
+        LOG_ERROR("init_index failed for %s, errno=%d", path.c_str(),
+                  error_code);
+        return error_code;
+      }
+    }
+
+    // Open in writable mode when the caller expects to modify the index
+    // (create_if_missing=true implies write intent, same as MMapFileStorage).
+    buffer_pool_ = std::make_shared<ailego::VecBufferPool>(
+        path, /*writable=*/create_if_missing);
     buffer_pool_handle_ = std::make_shared<ailego::VecBufferPoolHandle>(
         buffer_pool_->get_handle());
     int ret = ParseToMapping();
     if (ret != 0) {
+      this->close_index();
       return ret;
     }
     ret = buffer_pool_->init();
     if (ret != 0) {
+      this->close_index();
       return ret;
     }
     LOG_INFO(
-        "BufferStorage opened: file=%s, max_segment_size=%zu, "
+        "BufferStorage opened: file=%s, writable=%d, max_segment_size=%zu, "
         "segment_count=%zu",
-        file_name_.c_str(), (size_t)max_segment_size_, segments_.size());
+        file_name_.c_str(), static_cast<int>(create_if_missing),
+        static_cast<size_t>(max_segment_size_), segments_.size());
     return 0;
   }
 
-  void register_tmp_buffer(char *buf) {
-    std::lock_guard<std::mutex> latch(tmp_buffers_mutex_);
-    tmp_buffers_.push_back(buf);
-  }
-
-  char *get_buffer(size_t offset, size_t length, size_t /*block_id*/) {
-    char *tmp = static_cast<char *>(ailego_aligned_malloc(length, 4096));
-    if (!tmp) {
-      return nullptr;
-    }
-    if (!buffer_pool_handle_->read_range(offset, length, tmp)) {
-      ailego_free(tmp);
-      return nullptr;
-    }
-    register_tmp_buffer(tmp);
-    return tmp;
-  }
-
-  int get_meta(size_t offset, size_t length, char *out) {
-    return buffer_pool_handle_->get_meta(offset, length, out);
-  }
-
-  int ParseHeader(size_t offset) {
-    std::unique_ptr<char[]> buffer(new char[sizeof(header_)]);
-    if (get_meta(offset, sizeof(header_), buffer.get()) != 0) {
+  // PRECONDITION (also for ParseFooter/ParseSegment/ParseToMapping):
+  // caller holds either single-threaded open() or AllShardsExclusiveLatch.
+  // Do NOT add an internal lock here -- std::shared_mutex is not reentrant.
+  int ParseHeader(size_t offset, IndexFormat::MetaHeader *out) {
+    constexpr size_t kHeaderSize = sizeof(IndexFormat::MetaHeader);
+    std::unique_ptr<char[]> buffer(new char[kHeaderSize]);
+    if (buffer_pool_handle_->get_meta(offset, kHeaderSize, buffer.get()) != 0) {
       LOG_ERROR("Get segment header failed.");
       return IndexError_Runtime;
     }
-    uint8_t *header_ptr = reinterpret_cast<uint8_t *>(buffer.get());
-    memcpy(&header_, header_ptr, sizeof(header_));
-    if (header_.meta_header_size != sizeof(IndexFormat::MetaHeader)) {
+    memcpy(out, buffer.get(), kHeaderSize);
+    if (out->meta_header_size != kHeaderSize) {
       LOG_ERROR("Header meta size is invalid.");
       return IndexError_InvalidLength;
     }
-    if (ailego::Crc32c::Hash(&header_, sizeof(header_), header_.header_crc) !=
-        header_.header_crc) {
+    if (ailego::Crc32c::Hash(out, kHeaderSize, out->header_crc) !=
+        out->header_crc) {
       LOG_ERROR("Header meta checksum is invalid.");
       return IndexError_InvalidChecksum;
     }
@@ -286,7 +548,8 @@ class BufferStorage : public IndexStorage {
 
   int ParseFooter(size_t offset) {
     std::unique_ptr<char[]> buffer(new char[sizeof(footer_)]);
-    if (get_meta(offset, sizeof(footer_), buffer.get()) != 0) {
+    if (buffer_pool_handle_->get_meta(offset, sizeof(footer_), buffer.get()) !=
+        0) {
       LOG_ERROR("Get segment footer failed.");
       return IndexError_Runtime;
     }
@@ -304,12 +567,12 @@ class BufferStorage : public IndexStorage {
     return 0;
   }
 
-  int ParseSegment(size_t offset) {
-    std::lock_guard<std::mutex> latch(mapping_mutex_);
+  int ParseSegment(size_t offset, IndexFormat::MetaHeader *chain_header,
+                   uint32_t *out_segment_ids_offset) {
     std::unique_ptr<char[]> segment_buffer =
         std::make_unique<char[]>(footer_.segments_meta_size);
-    if (get_meta(offset, footer_.segments_meta_size, segment_buffer.get()) !=
-        0) {
+    if (buffer_pool_handle_->get_meta(offset, footer_.segments_meta_size,
+                                      segment_buffer.get()) != 0) {
       LOG_ERROR("Get segment meta failed.");
       return IndexError_Runtime;
     }
@@ -324,7 +587,7 @@ class BufferStorage : public IndexStorage {
     for (IndexFormat::SegmentMeta *iter = segment_start,
                                   *end = segment_start + footer_.segment_count;
          iter != end; ++iter) {
-      if (iter->segment_id_offset > footer_.segments_meta_size) {
+      if (iter->segment_id_offset >= footer_.segments_meta_size) {
         return IndexError_InvalidValue;
       }
       if (iter->data_index > footer_.content_size) {
@@ -337,15 +600,34 @@ class BufferStorage : public IndexStorage {
       if (iter->segment_id_offset < segment_ids_offset) {
         segment_ids_offset = iter->segment_id_offset;
       }
-      id_hash_.emplace(
-          std::string(reinterpret_cast<const char *>(segment_start) +
-                      iter->segment_id_offset),
-          segments_.size());
-      segments_.emplace(
-          std::string(reinterpret_cast<const char *>(segment_start) +
-                      iter->segment_id_offset),
+      // Use id_hash_.size() (not segments_.size()) for the block_id:
+      // segments_ is intentionally NOT cleared between appends to keep
+      // existing WrappedSegment pointers valid, so it carries stale entries.
+      //
+      // Bound the C-string scan to the segments_meta buffer so a missing
+      // NUL terminator cannot walk past the buffer end (defence against
+      // crafted-CRC inputs; CRC already covers benign bit flips).
+      const char *seg_name_start =
+          reinterpret_cast<const char *>(segment_start) +
+          iter->segment_id_offset;
+      const size_t seg_name_max =
+          footer_.segments_meta_size - iter->segment_id_offset;
+      const size_t seg_name_len = ::strnlen(seg_name_start, seg_name_max);
+      if (seg_name_len == seg_name_max) {
+        LOG_ERROR("ParseSegment: segment_id missing NUL terminator, file[%s]",
+                  file_name_.c_str());
+        return IndexError_InvalidValue;
+      }
+      const std::string seg_name(seg_name_start, seg_name_len);
+      const size_t seg_id = id_hash_.size();
+      id_hash_[seg_name] = seg_id;
+      // In-place update so existing WrappedSegment pointers see the
+      // refreshed meta_ptr_ after re-parse.  chain_header MUST be the
+      // per-chain owning copy (not a shared &header_) -- see
+      // chain_headers_ field comment.
+      segments_[seg_name] =
           IndexMapping::SegmentInfo{IndexMapping::Segment{iter},
-                                    current_header_start_offset_, &header_});
+                                    current_header_start_offset_, chain_header};
       max_segment_size_ =
           std::max(max_segment_size_, iter->data_size + iter->padding_size);
       if (sizeof(IndexFormat::SegmentMeta) * footer_.segment_count >
@@ -354,36 +636,53 @@ class BufferStorage : public IndexStorage {
       }
     }
     buffer_pool_buffers_.push_back(std::move(segment_buffer));
+    if (out_segment_ids_offset) {
+      *out_segment_ids_offset = segment_ids_offset;
+    }
     return 0;
   }
 
   int ParseToMapping() {
     while (true) {
       int ret;
-      ret = ParseHeader(current_header_start_offset_);
+      // Per-chain owning MetaHeader; see chain_headers_ field comment.
+      chain_headers_.emplace_back(std::make_unique<IndexFormat::MetaHeader>());
+      IndexFormat::MetaHeader *chain_header = chain_headers_.back().get();
+      ret = ParseHeader(current_header_start_offset_, chain_header);
       if (ret != 0) {
         LOG_ERROR("Failed to parse header, errno %d, %s", ret,
                   IndexError::What(ret));
         return ret;
       }
 
-      switch (header_.version) {
+      switch (chain_header->version) {
         case IndexFormat::FORMAT_VERSION:
           break;
         default:
-          LOG_ERROR("Unsupported index version: %u", header_.version);
+          LOG_ERROR("Unsupported index version: %u", chain_header->version);
           return IndexError_Unsupported;
       }
 
       // Unpack footer
-      if (header_.meta_footer_size != sizeof(IndexFormat::MetaFooter)) {
+      if (chain_header->meta_footer_size != sizeof(IndexFormat::MetaFooter)) {
         return IndexError_InvalidLength;
       }
-      if ((int32_t)header_.meta_footer_offset < 0) {
+      if ((int32_t)chain_header->meta_footer_offset < 0) {
         return IndexError_Unsupported;
       }
       uint64_t footer_offset =
-          header_.meta_footer_offset + current_header_start_offset_;
+          chain_header->meta_footer_offset + current_header_start_offset_;
+      // Reject uint64 wrap-around and offsets past file_size.
+      if (footer_offset < current_header_start_offset_ ||
+          footer_offset + sizeof(IndexFormat::MetaFooter) >
+              buffer_pool_->file_size()) {
+        LOG_ERROR(
+            "ParseToMapping: invalid footer_offset=%zu (header=%zu, "
+            "file_size=%zu), file[%s]",
+            (size_t)footer_offset, (size_t)current_header_start_offset_,
+            (size_t)buffer_pool_->file_size(), file_name_.c_str());
+        return IndexError_InvalidValue;
+      }
       ret = ParseFooter(footer_offset);
       if (ret != 0) {
         LOG_ERROR("Failed to parse footer, errno %d, %s", ret,
@@ -398,17 +697,49 @@ class BufferStorage : public IndexStorage {
       }
       const uint64_t segment_start_offset =
           footer_offset - footer_.segments_meta_size;
-      ret = ParseSegment(segment_start_offset);
+      uint32_t segment_ids_offset = footer_.segments_meta_size;
+      ret =
+          ParseSegment(segment_start_offset, chain_header, &segment_ids_offset);
       if (ret != 0) {
         LOG_ERROR("Failed to parse segment, errno %d, %s", ret,
                   IndexError::What(ret));
         return ret;
       }
 
+      // Record per-chain metadata offsets so flush_index() can write
+      // updated segment metas and footers back to the backing file.
+      meta_chains_.push_back({current_header_start_offset_, footer_offset,
+                              segment_start_offset, footer_.segments_meta_size,
+                              segment_ids_offset, footer_});
+
       if (footer_.next_meta_header_offset == 0) {
         break;
       }
-      current_header_start_offset_ = footer_.next_meta_header_offset;
+      // Reject self-reference / backward jumps and offsets past file_size
+      // (subtraction form, so a huge next_off cannot wrap the bound check):
+      // a corrupted offset would otherwise grow the chain until OOM.
+      const uint64_t next_off = footer_.next_meta_header_offset;
+      const uint64_t file_len = buffer_pool_->file_size();
+      if (next_off <= current_header_start_offset_ || next_off > file_len ||
+          file_len - next_off < sizeof(IndexFormat::MetaHeader)) {
+        LOG_ERROR(
+            "ParseToMapping: invalid next_meta_header_offset=%zu "
+            "(current=%zu, file_size=%zu), file[%s]",
+            (size_t)next_off, (size_t)current_header_start_offset_,
+            (size_t)file_len, file_name_.c_str());
+        return IndexError_InvalidValue;
+      }
+      // Bound chain count: 1024 chains @ default 1MB segment_meta_capacity
+      // covers >1GB of metadata, far above realistic load.
+      constexpr size_t kMaxChains = 1024;
+      if (chain_headers_.size() >= kMaxChains) {
+        LOG_ERROR(
+            "ParseToMapping: chain count exceeds limit %zu, file[%s] may "
+            "be corrupted",
+            kMaxChains, file_name_.c_str());
+        return IndexError_InvalidLength;
+      }
+      current_header_start_offset_ = next_off;
     }
     return 0;
   }
@@ -441,13 +772,18 @@ class BufferStorage : public IndexStorage {
 
   //! Retrieve a segment by id
   IndexStorage::Segment::Pointer get(const std::string &id, int) override {
-    auto segment_info = this->get_segment_info(id);
-    if (!segment_info) {
+    std::shared_lock<std::shared_mutex> latch(
+        mapping_shards_[mapping_shard_id()].mtx);
+    auto seg_iter = segments_.find(id);
+    if (seg_iter == segments_.end()) {
+      return WrappedSegment::Pointer{};
+    }
+    auto id_iter = id_hash_.find(id);
+    if (id_iter == id_hash_.end()) {
       return WrappedSegment::Pointer{};
     }
-    return std::make_shared<WrappedSegment>(
-        this, &segment_info->segment, segment_info->segment_header_start_offset,
-        segment_info->segment_header, id_hash_[id]);
+    return std::make_shared<WrappedSegment>(this, &seg_iter->second,
+                                            id_iter->second);
   }
 
   //! Test if it a segment exists
@@ -457,20 +793,24 @@ class BufferStorage : public IndexStorage {
 
   //! Retrieve magic number of index
   uint32_t magic(void) const override {
-    return header_.magic;
+    if (chain_headers_.empty()) {
+      return 0u;
+    }
+    return chain_headers_.front()->magic;
   }
 
  protected:
-  //! Initialize index version segment
-  int init_version_segment(void) {
+  //! Initialize index version segment (writes content into an IndexMapping).
+  //! Only intended to be called from init_index() while `mapping` is still
+  //! open in create-mode.
+  int init_version_segment(IndexMapping &mapping) {
     size_t data_size = std::strlen(IndexVersion::Details());
-    int error_code =
-        this->append_segment(INDEX_VERSION_SEGMENT_NAME, data_size);
+    int error_code = mapping.append(INDEX_VERSION_SEGMENT_NAME, data_size);
     if (error_code != 0) {
       return error_code;
     }
-
-    auto segment = &get_segment_info(INDEX_VERSION_SEGMENT_NAME)->segment;
+    IndexMapping::Segment *segment =
+        mapping.map(INDEX_VERSION_SEGMENT_NAME, false, false);
     if (!segment) {
       return IndexError_MMapFile;
     }
@@ -484,45 +824,189 @@ class BufferStorage : public IndexStorage {
     return 0;
   }
 
-  //! Initialize index file
-  int init_index(const std::string & /*path*/) {
-    // Add index version
-    int error_code = this->init_version_segment();
-    if (error_code != 0) {
-      return error_code;
+  //! Create the initial on-disk index structure and write the mandatory
+  //! version segment.  Uses IndexMapping (the same engine as MMapFileStorage)
+  //! so the produced file is fully compatible with both storage backends.
+  int init_index(const std::string &path) {
+    IndexMapping mapping;
+    int ret = mapping.create(path, segment_meta_capacity_);
+    if (ret != 0) {
+      LOG_ERROR(
+          "BufferStorage failed to create index file: path[%s], errno[%d]",
+          path.c_str(), ret);
+      return ret;
     }
-
-    // Refresh mapping
-    this->refresh_index(0);
-    return 0;
+    ret = this->init_version_segment(mapping);
+    if (ret != 0) {
+      LOG_ERROR(
+          "BufferStorage failed to append version segment: path[%s], errno[%d]",
+          path.c_str(), ret);
+      mapping.close();
+      return ret;
+    }
+    mapping.refresh(0);
+    ret = mapping.flush();
+    mapping.close();
+    if (ret != 0) {
+      LOG_ERROR(
+          "BufferStorage failed to flush new index file: path[%s], errno[%d]",
+          path.c_str(), ret);
+    }
+    return ret;
   }
 
-  //! Set the index file as dirty
+  //! Mark the index as dirty.  HOT PATH: store(true) unconditionally --
+  //! a load-then-store guard could let a stale cached `true` skip the
+  //! store after flush_index() CAS'd dirty=false on another core, losing
+  //! the writer's modification.
   void set_as_dirty(void) {
-    index_dirty_ = true;
+    index_dirty_.store(true, std::memory_order_relaxed);
   }
 
   //! Refresh meta information (checksum, update time, etc.)
-  void refresh_index(uint64_t /*chkp*/) {}
+  void refresh_index(uint64_t chkp) {
+    // CAS-loop max: callers may invoke refresh() out of order, and the
+    // persisted check_point must be non-decreasing.  Relaxed ordering is
+    // sufficient because flush_index() takes AllShardsExclusiveLatch which
+    // establishes the necessary happens-before for the disk write.
+    if (chkp != 0) {
+      uint64_t cur = pending_check_point_.load(std::memory_order_relaxed);
+      while (chkp > cur) {
+        if (pending_check_point_.compare_exchange_weak(
+                cur, chkp, std::memory_order_relaxed)) {
+          break;
+        }
+      }
+    }
+    // Set dirty unconditionally even if our chkp lost the CAS race: the
+    // winning larger chkp must still be flushed.
+    index_dirty_.store(true, std::memory_order_relaxed);
+  }
 
-  //! Flush index storage
+  //! Flush index storage.
   int flush_index(void) {
+    if (!index_dirty_.load(std::memory_order_relaxed)) {
+      return 0;
+    }
+    // Exclusive all-shards latch excludes the lock-free hot path while we
+    // hash meta_buf and pwrite footer; without it segments_meta_crc would
+    // not match the bytes on disk.
+    AllShardsExclusiveLatch latch(mapping_shards_);
+    return flush_index_locked();
+  }
+
+  //! Flush dirty data blocks, then each chain's segment meta + footer.
+  //! PRECONDITION: caller holds AllShardsExclusiveLatch (flush_index() takes
+  //! it; close_index() keeps it across teardown).  Returns 0 or IndexError_*.
+  int flush_index_locked(void) {
+    // No-op on never-opened / already-closed storage: close_index()
+    // unconditionally calls us during teardown.
+    if (!buffer_pool_ || !buffer_pool_handle_) {
+      index_dirty_.store(false, std::memory_order_relaxed);
+      return 0;
+    }
+    if (corrupted_.load(std::memory_order_acquire)) {
+      LOG_ERROR(
+          "BufferStorage::flush_index skipped: storage is marked corrupted, "
+          "file[%s]",
+          file_name_.c_str());
+      return IndexError_Runtime;
+    }
+    if (!buffer_pool_->writable()) {
+      // Read-only pool: nothing to flush.
+      index_dirty_.store(false, std::memory_order_relaxed);
+      return 0;
+    }
+    // Claim dirty atomically AT THE START so any concurrent write() that
+    // lands during this flush re-sets dirty=true and is picked up by the
+    // next flush; an unconditional store(false) at the end would silently
+    // swallow it.
+    bool expected_dirty = true;
+    if (!index_dirty_.compare_exchange_strong(expected_dirty, false,
+                                              std::memory_order_relaxed)) {
+      // Not dirty: already claimed, or nothing was pending; bail out.
+      return 0;
+    }
+    // Snapshot pending_check_point_ AFTER claiming dirty: any newer chkp
+    // stored by a concurrent refresh_index() will be preserved by the
+    // CAS-reset at the end (and refresh_index() will have re-set dirty).
+    const uint64_t consumed_chkp =
+        pending_check_point_.load(std::memory_order_relaxed);
+    // Restore consumed_chkp on failure paths (CAS-loop max, same as
+    // refresh_index()) so a concurrent larger chkp wins.
+    auto restore_chkp_on_failure = [this, consumed_chkp]() {
+      if (consumed_chkp == 0) return;
+      uint64_t cur = pending_check_point_.load(std::memory_order_relaxed);
+      while (consumed_chkp > cur) {
+        if (pending_check_point_.compare_exchange_weak(
+                cur, consumed_chkp, std::memory_order_relaxed)) {
+          break;
+        }
+      }
+    };
+    // Flush dirty data blocks first; on failure re-arm dirty for a retry.
+    if (buffer_pool_handle_->flush_all() != 0) {
+      index_dirty_.store(true, std::memory_order_relaxed);
+      restore_chkp_on_failure();
+      LOG_ERROR("flush_all data blocks failed: file[%s]", file_name_.c_str());
+      return IndexError_WriteData;
+    }
+    // Per-chain: recompute segments_meta CRC, refresh footer, pwrite both.
+    for (size_t ci = 0;
+         ci < meta_chains_.size() && ci < buffer_pool_buffers_.size(); ++ci) {
+      MetaChain &mchain = meta_chains_[ci];
+      const char *seg_buf = buffer_pool_buffers_[ci].get();
+      mchain.footer.segments_meta_crc =
+          ailego::Crc32c::Hash(seg_buf, mchain.segment_meta_size, 0u);
+      IndexFormat::UpdateMetaFooter(&mchain.footer, consumed_chkp);
+      if (buffer_pool_handle_->write_meta(mchain.segment_meta_file_offset,
+                                          mchain.segment_meta_size,
+                                          seg_buf) != 0) {
+        LOG_ERROR("Failed to write segment meta: file[%s], chain[%zu]",
+                  file_name_.c_str(), ci);
+        index_dirty_.store(true, std::memory_order_relaxed);
+        restore_chkp_on_failure();
+        return IndexError_WriteData;
+      }
+      if (buffer_pool_handle_->write_meta(
+              mchain.footer_file_offset, sizeof(mchain.footer),
+              reinterpret_cast<const char *>(&mchain.footer)) != 0) {
+        LOG_ERROR("Failed to write footer: file[%s], chain[%zu]",
+                  file_name_.c_str(), ci);
+        index_dirty_.store(true, std::memory_order_relaxed);
+        restore_chkp_on_failure();
+        return IndexError_WriteData;
+      }
+    }
+    if (!meta_chains_.empty()) {
+      footer_ = meta_chains_.back().footer;
+    }
+    // CAS-reset pending: only consume the chkp we observed.  A concurrent
+    // larger chkp survives and will be flushed next round (refresh_index()
+    // also re-set dirty).
+    uint64_t expected_chkp = consumed_chkp;
+    pending_check_point_.compare_exchange_strong(expected_chkp, 0,
+                                                 std::memory_order_relaxed);
     return 0;
   }
 
   //! Close index storage
   void close_index(void) {
-    std::lock_guard<std::mutex> latch(mapping_mutex_);
+    // Hold ONE continuous all-shards latch across flush + teardown so no
+    // writer can slip in between (which would dirty meta_buf only to have
+    // the page table reset under it, dropping the modification).
+    AllShardsExclusiveLatch latch(mapping_shards_);
+    flush_index_locked();
     file_name_.clear();
     id_hash_.clear();
     segments_.clear();
-    memset(&header_, 0, sizeof(header_));
+    chain_headers_.clear();
     memset(&footer_, 0, sizeof(footer_));
     {
       std::lock_guard<std::mutex> tmp_latch(tmp_buffers_mutex_);
-      for (char *p : tmp_buffers_) {
-        if (p) {
-          ailego_free(p);
+      for (const ArenaBlock &b : tmp_buffers_) {
+        if (b.base) {
+          ailego_free(b.base);
         }
       }
       tmp_buffers_.clear();
@@ -531,39 +1015,473 @@ class BufferStorage : public IndexStorage {
     buffer_pool_.reset();
     max_segment_size_ = 0;
     buffer_pool_buffers_.clear();
+    meta_chains_.clear();
+    current_header_start_offset_ = 0;
+    pending_check_point_.store(0, std::memory_order_relaxed);
+    index_dirty_.store(false, std::memory_order_relaxed);
+    corrupted_.store(false, std::memory_order_relaxed);
   }
 
-  //! Append a segment into storage
-  int append_segment(const std::string & /*id*/, size_t /*size*/) {
+  //! Append a new, empty segment `id` with room for `size` bytes (rounded
+  //! up to whole pages).  Starts a new meta chain first when the current
+  //! one cannot hold the entry.  The all-shards exclusive latch is held for
+  //! the whole update, disk writes included; failed steps are rolled back.
+  //! @return 0 on success; IndexError_InvalidArgument for size == 0 or an
+  //!         id too long for any chain; IndexError_Duplicate for a known id;
+  //!         IndexError_WriteData / IndexError_Runtime on other failures.
+  int append_segment(const std::string &id, size_t size) {
+    // Persist any pending data_size/padding/CRC mutations from prior
+    // write()/resize() before we re-hash and rewrite the segment_meta.
+    this->flush_index();
+
+    AllShardsExclusiveLatch latch(mapping_shards_);
+
+    if (!buffer_pool_ || !buffer_pool_handle_) {
+      LOG_ERROR("append_segment: pool not ready, file[%s]", file_name_.c_str());
+      return IndexError_Runtime;
+    }
+    if (corrupted_.load(std::memory_order_acquire)) {
+      LOG_ERROR(
+          "append_segment: storage is marked corrupted, refusing to append, "
+          "file[%s], id[%s]",
+          file_name_.c_str(), id.c_str());
+      return IndexError_Runtime;
+    }
+    if (!buffer_pool_->writable()) {
+      LOG_ERROR("append_segment: pool is read-only, file[%s]",
+                file_name_.c_str());
+      return IndexError_Runtime;
+    }
+    if (size == 0) {
+      return IndexError_InvalidArgument;
+    }
+    if (segments_.find(id) != segments_.end()) {
+      return IndexError_Duplicate;
+    }
+    if (meta_chains_.empty() || chain_headers_.empty() ||
+        buffer_pool_buffers_.empty()) {
+      LOG_ERROR("append_segment: invalid state, file[%s]", file_name_.c_str());
+      return IndexError_Runtime;
+    }
+
+    // Page-aligned padded size; matches IndexMapping::CalcPageAlignedSize().
+    const size_t page_size = ailego::kVectorPageSize;
+    const size_t padded_size = (size + page_size - 1) / page_size * page_size;
+
+    // The current last chain owns footer_ (overwritten by ParseFooter).
+    size_t id_size = id.length() + 1;
+    size_t need_size = sizeof(IndexFormat::SegmentMeta) + id_size;
+    MetaChain *chain = &meta_chains_.back();
+    IndexFormat::MetaHeader *header = chain_headers_.back().get();
+    char *meta_buf = buffer_pool_buffers_.back().get();
+
+    // Undo handle for a committed chain split: a no-op until Step 1 commits,
+    // then it unlinks and drops the new (still empty) chain on later failure.
+    std::function<void()> rollback_step1 = []() {};
+
+    // ---- Step 1: chain split if current chain has no meta capacity left.
+    if (sizeof(IndexFormat::SegmentMeta) * footer_.segment_count + need_size >
+        chain->segment_ids_offset) {
+      size_t new_chain_start = buffer_pool_->file_size();
+      new_chain_start =
+          (new_chain_start + page_size - 1) / page_size * page_size;
+      size_t new_meta_total =
+          (segment_meta_capacity_ + sizeof(IndexFormat::MetaHeader) +
+           sizeof(IndexFormat::MetaFooter) + page_size - 1) /
+          page_size * page_size;
+      uint32_t new_segments_meta_size = static_cast<uint32_t>(
+          new_meta_total - sizeof(IndexFormat::MetaHeader) -
+          sizeof(IndexFormat::MetaFooter));
+      // Not even an empty chain can hold this entry.  Refuse before touching
+      // the file: otherwise the uint32_t id-offset math in Step 2 would wrap
+      // and the id memcpy would land outside meta_buf.
+      if (need_size > new_segments_meta_size) {
+        LOG_ERROR("append_segment: id too long for a meta chain, file[%s]",
+                  file_name_.c_str());
+        return IndexError_InvalidArgument;
+      }
+
+      // Stage the linked old footer without mutating footer_ yet.
+      const auto saved_footer_before_split = footer_;
+      IndexFormat::MetaFooter linked_footer = footer_;
+      linked_footer.next_meta_header_offset = new_chain_start;
+      IndexFormat::UpdateMetaFooter(&linked_footer, 0);
+
+      if (buffer_pool_handle_->write_meta(
+              chain->footer_file_offset, sizeof(linked_footer),
+              reinterpret_cast<const char *>(&linked_footer)) != 0) {
+        LOG_ERROR("append_segment: write old footer failed, file[%s]",
+                  file_name_.c_str());
+        return IndexError_WriteData;
+      }
+
+      // Best-effort restore of the old footer if a later write in this split
+      // block fails.  If the restore itself fails, mark the storage corrupted:
+      // the on-disk old footer then points at a partial new chain region.
+      auto undo_old_footer = [this, chain, &saved_footer_before_split]() {
+        if (buffer_pool_handle_->write_meta(
+                chain->footer_file_offset, sizeof(saved_footer_before_split),
+                reinterpret_cast<const char *>(&saved_footer_before_split)) !=
+            0) {
+          LOG_ERROR(
+              "append_segment: rollback write of old footer FAILED, file[%s] "
+              "is now in an inconsistent state -- marking storage as "
+              "corrupted; further writes will be rejected.",
+              file_name_.c_str());
+          corrupted_.store(true, std::memory_order_release);
+        }
+      };
+
+      // Extend the file and write the new chain's header + (zero) footer.
+      // The segment_meta region is zero-filled by ftruncate.
+      if (!buffer_pool_->extend_file(new_chain_start + new_meta_total)) {
+        undo_old_footer();
+        return IndexError_Runtime;
+      }
+
+      auto new_header = std::make_unique<IndexFormat::MetaHeader>();
+      IndexFormat::SetupMetaHeader(
+          new_header.get(),
+          static_cast<uint32_t>(new_meta_total -
+                                sizeof(IndexFormat::MetaFooter)),
+          static_cast<uint32_t>(new_meta_total));
+
+      auto new_meta_buf = std::make_unique<char[]>(new_segments_meta_size);
+      std::memset(new_meta_buf.get(), 0, new_segments_meta_size);
+
+      IndexFormat::MetaFooter new_footer;
+      IndexFormat::SetupMetaFooter(&new_footer);
+      new_footer.segments_meta_size = new_segments_meta_size;
+      new_footer.total_size = new_meta_total;
+      new_footer.segments_meta_crc =
+          ailego::Crc32c::Hash(new_meta_buf.get(), new_segments_meta_size, 0u);
+      IndexFormat::UpdateMetaFooter(&new_footer, 0);
+
+      if (buffer_pool_handle_->write_meta(
+              new_chain_start, sizeof(IndexFormat::MetaHeader),
+              reinterpret_cast<const char *>(new_header.get())) != 0) {
+        undo_old_footer();
+        return IndexError_WriteData;
+      }
+      uint64_t new_segment_meta_file_offset =
+          new_chain_start + sizeof(IndexFormat::MetaHeader);
+      uint64_t new_footer_file_offset =
+          new_chain_start + new_header->meta_footer_offset;
+      if (buffer_pool_handle_->write_meta(
+              new_footer_file_offset, sizeof(new_footer),
+              reinterpret_cast<const char *>(&new_footer)) != 0) {
+        undo_old_footer();
+        return IndexError_WriteData;
+      }
+
+      // Snapshot the OLD chain's pre-commit state for rollback_step1
+      // (captured by value: `chain` is reassigned below).
+      const auto saved_old_chain_footer = chain->footer;
+      const uint64_t saved_old_footer_file_offset = chain->footer_file_offset;
+      const uint64_t saved_current_header_start = current_header_start_offset_;
+
+      // reserve() FIRST so the three push_back's below cannot throw mid-way
+      // and leave chain_headers_/buffer_pool_buffers_/meta_chains_ at
+      // mismatched sizes (flush_index_locked() would silently skip the tail).
+      try {
+        chain_headers_.reserve(chain_headers_.size() + 1);
+        buffer_pool_buffers_.reserve(buffer_pool_buffers_.size() + 1);
+        meta_chains_.reserve(meta_chains_.size() + 1);
+      } catch (const std::bad_alloc &) {
+        LOG_ERROR(
+            "append_segment: reserve for chain-split commit failed, file[%s]",
+            file_name_.c_str());
+        undo_old_footer();
+        return IndexError_Runtime;
+      }
+      chain = &meta_chains_.back();
+      chain->footer = linked_footer;  // old chain keeps linked footer
+      chain_headers_.push_back(std::move(new_header));
+      buffer_pool_buffers_.push_back(std::move(new_meta_buf));
+      meta_chains_.push_back(MetaChain{
+          new_chain_start, new_footer_file_offset, new_segment_meta_file_offset,
+          new_segments_meta_size, new_segments_meta_size, new_footer});
+      footer_ = new_footer;
+      current_header_start_offset_ = new_chain_start;
+
+      chain = &meta_chains_.back();
+      header = chain_headers_.back().get();
+      meta_buf = buffer_pool_buffers_.back().get();
+
+      // Install rollback for the committed split.  Everything is captured by
+      // value so reassigning chain/header/meta_buf cannot corrupt the closure.
+      rollback_step1 = [this, saved_footer_before_split, saved_old_chain_footer,
+                        saved_old_footer_file_offset,
+                        saved_current_header_start]() {
+        // 1. Drop the forward link on the old footer.  If this fails the
+        //    on-disk old footer still points at the popped new chain
+        //    region -- mark corrupted.
+        if (buffer_pool_handle_->write_meta(
+                saved_old_footer_file_offset, sizeof(saved_footer_before_split),
+                reinterpret_cast<const char *>(&saved_footer_before_split)) !=
+            0) {
+          LOG_ERROR(
+              "append_segment: rollback_step1 write of old footer FAILED, "
+              "file[%s] is now in an inconsistent state -- marking storage "
+              "as corrupted; further writes will be rejected.",
+              file_name_.c_str());
+          corrupted_.store(true, std::memory_order_release);
+        }
+        // 2. Pop the freshly-pushed new chain (releases its unique_ptrs).
+        if (!meta_chains_.empty()) meta_chains_.pop_back();
+        if (!chain_headers_.empty()) chain_headers_.pop_back();
+        if (!buffer_pool_buffers_.empty()) buffer_pool_buffers_.pop_back();
+        // 3. Restore the old chain's in-memory footer (forward link cleared).
+        if (!meta_chains_.empty()) {
+          meta_chains_.back().footer = saved_old_chain_footer;
+        }
+        // 4. Restore footer_ + current_header_start_offset_.  The file is
+        //    intentionally NOT shrunk: the orphan region is unreachable (step
+        //    1 cleared the link) and reusable via file_size() realignment.
+        footer_ = saved_footer_before_split;
+        current_header_start_offset_ = saved_current_header_start;
+      };
+    }
+
+    // ---- Step 2: append SegmentMeta + ID into the (possibly new) last
+    //              chain, then persist meta_buf and footer.
+    uint64_t new_data_index = footer_.content_size;
+    uint64_t new_seg_abs_offset =
+        chain->header_start_offset + header->content_offset + new_data_index;
+    uint64_t new_file_size = new_seg_abs_offset + padded_size;
+    if (new_file_size > buffer_pool_->file_size() &&
+        !buffer_pool_->extend_file(new_file_size)) {
+      rollback_step1();  // nothing else changed yet; undo a fresh split
+      return IndexError_Runtime;
+    }
+
+    // Save mutable state for rollback if a Step 2 disk write fails.  The
+    // meta_buf bytes about to be overwritten (SegmentMeta entry + ID string)
+    // are snapshotted too, keeping the CRC consistent for a later flush.
+    const auto saved_footer = footer_;
+    const auto saved_chain_footer = chain->footer;
+    const auto saved_segment_ids_offset = chain->segment_ids_offset;
+    const size_t meta_entry_off =
+        sizeof(IndexFormat::SegmentMeta) * footer_.segment_count;
+    const uint32_t new_ids_off =
+        chain->segment_ids_offset - static_cast<uint32_t>(id_size);
+    char saved_meta_entry[sizeof(IndexFormat::SegmentMeta)];
+    std::memcpy(saved_meta_entry, meta_buf + meta_entry_off,
+                sizeof(IndexFormat::SegmentMeta));
+    auto saved_id_bytes = std::make_unique<char[]>(id_size);
+    std::memcpy(saved_id_bytes.get(), meta_buf + new_ids_off, id_size);
+
+    chain->segment_ids_offset -= static_cast<uint32_t>(id_size);
+    IndexFormat::SegmentMeta *new_seg =
+        reinterpret_cast<IndexFormat::SegmentMeta *>(meta_buf) +
+        footer_.segment_count;
+    new_seg->segment_id_offset = chain->segment_ids_offset;
+    new_seg->data_index = new_data_index;
+    new_seg->data_size = 0;
+    new_seg->data_crc = 0;
+    new_seg->padding_size = padded_size;
+    std::memcpy(meta_buf + chain->segment_ids_offset, id.c_str(), id_size);
+
+    footer_.segment_count += 1;
+    footer_.content_size += padded_size;
+    footer_.total_size += padded_size;
+    footer_.segments_meta_crc =
+        ailego::Crc32c::Hash(meta_buf, chain->segment_meta_size, 0u);
+    IndexFormat::UpdateMetaFooter(&footer_, 0);
+    chain->footer = footer_;  // sync in-memory copy for flush_index
+
+    // Rollback for Step 2: restore in-memory state AND best-effort rewrite
+    // the OLD segments_meta + footer to disk.  Without the rewrite, a failed
+    // append could leave on-disk bytes describing it, and ParseToMapping() on
+    // next open would surface a ghost segment missing from
+    // segments_/id_hash_.  If the rewrite itself fails the file cannot be
+    // repaired from here: raise corrupted_ so later writers refuse to proceed.
+    auto rollback_step2 = [&]() {
+      std::memcpy(meta_buf + meta_entry_off, saved_meta_entry,
+                  sizeof(IndexFormat::SegmentMeta));
+      std::memcpy(meta_buf + new_ids_off, saved_id_bytes.get(), id_size);
+      footer_ = saved_footer;
+      chain->footer = saved_chain_footer;
+      chain->segment_ids_offset = saved_segment_ids_offset;
+
+      const int rc_meta = buffer_pool_handle_->write_meta(
+          chain->segment_meta_file_offset, chain->segment_meta_size, meta_buf);
+      const int rc_footer = buffer_pool_handle_->write_meta(
+          chain->footer_file_offset, sizeof(footer_),
+          reinterpret_cast<const char *>(&footer_));
+      if (rc_meta != 0 || rc_footer != 0) {
+        LOG_ERROR(
+            "append_segment: rollback_step2 disk rewrite FAILED "
+            "(rc_meta=%d, rc_footer=%d), file[%s] is now in an "
+            "inconsistent state -- marking storage as corrupted; further "
+            "writes will be rejected.",
+            rc_meta, rc_footer, file_name_.c_str());
+        corrupted_.store(true, std::memory_order_release);
+      }
+    };
+
+    if (buffer_pool_handle_->write_meta(chain->segment_meta_file_offset,
+                                        chain->segment_meta_size,
+                                        meta_buf) != 0) {
+      LOG_ERROR("append_segment: write segment_meta failed, file[%s]",
+                file_name_.c_str());
+      rollback_step2();
+      rollback_step1();
+      return IndexError_WriteData;
+    }
+    if (buffer_pool_handle_->write_meta(
+            chain->footer_file_offset, sizeof(footer_),
+            reinterpret_cast<const char *>(&footer_)) != 0) {
+      LOG_ERROR("append_segment: write footer failed, file[%s]",
+                file_name_.c_str());
+      rollback_step2();
+      rollback_step1();
+      return IndexError_WriteData;
+    }
+
+    // In-memory commit as one transactional unit: if id_hash_ throws after
+    // segments_ succeeded, undo segments_ before rolling back.
+    // unordered_map::emplace() leaves existing element addresses stable, so
+    // WrappedSegment instances pointing into segments_ remain valid.
+    auto seg_ins = segments_.end();
+    bool seg_inserted = false;
+    try {
+      auto ins = segments_.emplace(
+          id, IndexMapping::SegmentInfo{IndexMapping::Segment{new_seg},
+                                        chain->header_start_offset, header});
+      if (!ins.second) {
+        // Cannot happen under the exclusive latch we hold (find() above
+        // checked), but be defensive.
+        LOG_ERROR(
+            "append_segment: duplicate id appeared after commit, file[%s], "
+            "id[%s]",
+            file_name_.c_str(), id.c_str());
+        rollback_step2();
+        rollback_step1();
+        return IndexError_Duplicate;
+      }
+      seg_ins = ins.first;
+      seg_inserted = true;
+      const size_t new_id = id_hash_.size();
+      id_hash_.emplace(id, new_id);
+    } catch (const std::bad_alloc &) {
+      LOG_ERROR(
+          "append_segment: in-memory commit OOM, rolling back, file[%s], "
+          "id[%s]",
+          file_name_.c_str(), id.c_str());
+      if (seg_inserted) {
+        segments_.erase(seg_ins);
+      }
+      rollback_step2();
+      rollback_step1();
+      return IndexError_Runtime;
+    }
+    max_segment_size_ = std::max<uint64_t>(max_segment_size_, padded_size);
+    // C1: extend_file() already extended the page table in-place; no pool
+    // rotation or flush_all needed.
     return 0;
   }
 
   //! Test if a segment exists
   bool has_segment(const std::string &id) const {
-    std::lock_guard<std::mutex> latch(mapping_mutex_);
+    MutexShard &shard = mapping_shards_[mapping_shard_id()];
+    std::shared_lock<std::shared_mutex> latch(shard.mtx);
     return (segments_.find(id) != segments_.end());
   }
 
-  //! Get a segment from storage
-  IndexMapping::SegmentInfo *get_segment_info(const std::string &id) {
-    std::lock_guard<std::mutex> latch(mapping_mutex_);
-    auto iter = segments_.find(id);
-    if (iter == segments_.end()) {
-      return nullptr;
-    }
-    return &iter->second;
+ private:
+  std::atomic<bool> index_dirty_{false};  // Unflushed changes are pending.
+  std::atomic<uint64_t> pending_check_point_{0};  // Max pending chkp; 0: none.
+  // Set when an append_segment() rollback fails to restore on-disk state.
+  // Once set, all writers (write/append_segment/flush_index_locked) refuse
+  // to proceed.  Only ever raised; cleared only by close_index().
+  std::atomic<bool> corrupted_{false};
+
+  // Sharded reader-writer lock: each reader hashes to a single shard, which
+  // limits cache-line ping-pong on the reader counter; writers lock all
+  // shards.
+  static constexpr size_t kMappingMutexShards = 32;
+  struct alignas(64) MutexShard {
+    std::shared_mutex mtx;
+  };
+  mutable MutexShard mapping_shards_[kMappingMutexShards]{};
+
+  // Per-(thread, instance) shard selection.  Mixing thread::id with `this`
+  // spreads two BufferStorage instances used by the same thread over
+  // (usually) different shards, whereas a thread_local-only id would pin
+  // them to one.  A boost-style hash_combine disperses skewed thread::id
+  // distributions across the 32 shards.
+  size_t mapping_shard_id() const {
+    size_t mixed = std::hash<std::thread::id>{}(std::this_thread::get_id());
+    const size_t self_hash = std::hash<const void *>{}(this);
+    // Same mixing step as boost::hash_combine(mixed, self_hash).
+    mixed ^= self_hash + 0x9e3779b97f4a7c15ULL + (mixed << 6) + (mixed >> 2);
+    return mixed % kMappingMutexShards;
   }
 
- private:
-  bool index_dirty_{false};
-  mutable std::mutex mapping_mutex_{};
+  // RAII guard for writers: locks ALL shards exclusively, in index order.
+  struct AllShardsExclusiveLatch {
+    MutexShard *shards_;
+    explicit AllShardsExclusiveLatch(MutexShard *shards) : shards_(shards) {
+      for (size_t i = 0; i < kMappingMutexShards; ++i) shards_[i].mtx.lock();
+    }
+    ~AllShardsExclusiveLatch() {
+      for (size_t i = 0; i < kMappingMutexShards; ++i) shards_[i].mtx.unlock();
+    }
+    AllShardsExclusiveLatch(const AllShardsExclusiveLatch &) = delete;
+    AllShardsExclusiveLatch &operator=(const AllShardsExclusiveLatch &) =
+        delete;
+  };
 
-  std::vector<char *> tmp_buffers_{};
+  // Arena slab for cross-page temp buffers handed out by
+  // WrappedSegment::read(const void**).  The legacy contract requires
+  // every returned pointer to stay valid until close_index(), so slots
+  // are never freed individually -- they are carved out of large
+  // 4K-aligned arenas which close_index() releases in bulk.
+  //
+  // Why an arena instead of one posix_memalign(4K, 4K) per read:
+  // Android Bionic scudo's small-class chunk pool is prone to large-
+  // alignment starvation under fragmentation (we observed sporadic
+  // posix_memalign(4096, 4096) returning ENOMEM even with plenty of
+  // free memory).  A single large request (>= kArenaSize) is served
+  // from scudo's secondary allocator (mmap-backed), which is reliable
+  // up to the true OOM boundary.
+  struct ArenaBlock {
+    char *base{nullptr};  // Arena start; passed to ailego_free() on close.
+    size_t size{0};  // Total bytes in this arena (4K-aligned).
+    size_t used{0};  // Bytes already handed out (4K-aligned).
+  };
+  // Caller MUST hold tmp_buffers_mutex_; alloc_size MUST be a multiple of
+  // 4096.  Returns nullptr if a fresh arena cannot be obtained or recorded.
+  char *tmp_arena_alloc_locked(size_t alloc_size) {
+    static constexpr size_t kAlign = 4096UL;
+    static constexpr size_t kArenaSize = 1UL << 20;  // 1 MiB
+    ArenaBlock *tail = tmp_buffers_.empty() ? nullptr : &tmp_buffers_.back();
+    if (tail && tail->base && tail->size - tail->used >= alloc_size) {
+      char *out = tail->base + tail->used;
+      tail->used += alloc_size;
+      return out;
+    }
+    size_t new_size = alloc_size > kArenaSize ? alloc_size : kArenaSize;
+    char *p = static_cast<char *>(ailego_aligned_malloc(new_size, kAlign));
+    if (!p) return nullptr;
+    try {
+      tmp_buffers_.push_back(ArenaBlock{p, new_size, alloc_size});
+    } catch (const std::bad_alloc &) {
+      ailego_free(p);  // never leak an arena that could not be recorded
+      return nullptr;
+    }
+    return p;
+  }
+  std::vector<ArenaBlock> tmp_buffers_{};
   mutable std::mutex tmp_buffers_mutex_{};
 
   // buffer manager
   std::string file_name_;
-  IndexFormat::MetaHeader header_{};
+  // Per-chain owning copies of MetaHeader.  segments_[name].segment_header
+  // points into one of these; using a single shared header_ would let the
+  // next chain's ParseHeader overwrite earlier-chain content_offset.
+  std::vector<std::unique_ptr<IndexFormat::MetaHeader>> chain_headers_{};
   IndexFormat::MetaFooter footer_{};
   std::unordered_map<std::string, IndexMapping::SegmentInfo> segments_{};
   std::unordered_map<std::string, size_t> id_hash_{};
@@ -573,6 +1491,26 @@ class BufferStorage : public IndexStorage {
   ailego::VecBufferPool::Pointer buffer_pool_{nullptr};
   ailego::VecBufferPoolHandle::Pointer buffer_pool_handle_{nullptr};
   uint64_t current_header_start_offset_{0u};
+
+  // Capacity (in bytes) of the segment metadata section written by
+  // init_index().
+  uint32_t segment_meta_capacity_{4096u};
+
+  // Per-header-chain file offsets used by flush_index() and append_segment().
+  struct MetaChain {
+    uint64_t header_start_offset;  // File offset of the chain's MetaHeader.
+    uint64_t footer_file_offset;  // File offset of the chain's MetaFooter.
+    uint64_t segment_meta_file_offset;  // File offset of the meta region.
+    uint32_t segment_meta_size;  // Size in bytes of the meta region.
+    // Lowest segment-ID-string offset within segment_meta; equals
+    // segment_meta_size when empty, decreases by strlen(id)+1 per append.
+    // Used to detect when a chain split is needed.
+    uint32_t segment_ids_offset;
+    // In-memory copy of this chain's MetaFooter, kept in sync with disk by
+    // flush_index() and append_segment() to avoid a pread per chain.
+    IndexFormat::MetaFooter footer;
+  };
+  std::vector<MetaChain> meta_chains_{};
 };
 
 INDEX_FACTORY_REGISTER_STORAGE(BufferStorage);
diff --git a/src/db/index/segment/segment.cc b/src/db/index/segment/segment.cc
index 96ec3dc37..6dd765262 100644
--- a/src/db/index/segment/segment.cc
+++ b/src/db/index/segment/segment.cc
@@ -526,10 +526,20 @@ Status SegmentImpl::close() {
     }
   }
   vector_indexers_.clear();
+  for (const auto &[name, indexers] : quant_vector_indexers_) {
+    for (auto indexer : indexers) {
+      indexer->Close();
+    }
+  }
+  quant_vector_indexers_.clear();
   for (auto [name, indexer] : memory_vector_indexers_) {
     indexer->Close();
   }
   memory_vector_indexers_.clear();
+  for (auto [name, indexer] : quant_memory_vector_indexers_) {
+    indexer->Close();
+  }
+  quant_memory_vector_indexers_.clear();
 
   return Status::OK();
 }
diff --git a/src/db/index/storage/lazy_record_batch_reader.h b/src/db/index/storage/lazy_record_batch_reader.h
index 451bba8e0..e1286e305 100644
--- a/src/db/index/storage/lazy_record_batch_reader.h
+++ b/src/db/index/storage/lazy_record_batch_reader.h
@@ -128,7 +128,8 @@ class ParquetRecordBatchReader : public arrow::RecordBatchReader {
     std::vector<std::shared_ptr<arrow::Array>> chunks(col_indices_.size());
     if (with_cache_) {
       for (size_t col_idx = 0; col_idx < col_indices_.size(); ++col_idx) {
-        auto buffer_id = ailego::ParquetBufferID(file_path_, col_idx, rg_id);
+        auto buffer_id =
+            ailego::ParquetBufferID(file_path_, col_indices_[col_idx], rg_id);
         auto buffer_handle =
             ailego::ParquetBufferPool::get_instance().acquire_buffer(buffer_id);
         std::shared_ptr<arrow::ChunkedArray> col_chunked_array =
diff --git a/src/db/index/storage/store_helper.h b/src/db/index/storage/store_helper.h
index f930e42ec..abb4599e5 100644
--- a/src/db/index/storage/store_helper.h
+++ b/src/db/index/storage/store_helper.h
@@ -267,12 +267,7 @@ inline arrow::Status ConvertScalarVectorToArrayByType(
         return arrow::Status::Invalid(
             "Cannot convert empty vector to list array");
       }
-
-      auto list_type = std::dynamic_pointer_cast<arrow::ListType>(type);
-      if (!list_type) {
-        return arrow::Status::TypeError("Expected ListType for LIST scalar");
-      }
-
+      auto list_type = std::static_pointer_cast<arrow::ListType>(type);
       std::unique_ptr<arrow::ArrayBuilder> value_builder;
       ARROW_RETURN_NOT_OK(arrow::MakeBuilder(arrow::default_memory_pool(),
                                              list_type->value_type(),
@@ -287,10 +282,9 @@ inline arrow::Status ConvertScalarVectorToArrayByType(
           continue;
         }
 
-        auto list_scalar = std::dynamic_pointer_cast<arrow::ListScalar>(scalar);
-        if (!list_scalar) {
-          return arrow::Status::TypeError("Expected ListScalar for LIST type");
-        }
+        // Same rationale: scalar->type->id() == LIST implies the
+        // scalar IS a ListScalar; avoid RTTI-dependent cast.
+        auto list_scalar = std::static_pointer_cast<arrow::ListScalar>(scalar);
 
         ARROW_RETURN_NOT_OK(builder.Append());
         auto value_builder_ptr = builder.value_builder();
@@ -371,12 +365,10 @@ inline arrow::Status AppendFieldValueToBuilder(
     }
     case arrow::Type::LIST: {
       auto list_builder = dynamic_cast<arrow::ListBuilder *>(builder);
-      auto list_type =
-          std::dynamic_pointer_cast<arrow::ListType>(field->type());
-
-      if (!list_type) {
-        return arrow::Status::TypeError("Field type is not ListType");
-      }
+      // Use static_pointer_cast: the switch guarantees type == LIST;
+      // dynamic_pointer_cast fails on Android due to RTTI divergence
+      // when Arrow is linked as a static archive.
+      auto list_type = std::static_pointer_cast<arrow::ListType>(field->type());
 
       auto value_type = list_type->value_type()->id();
 
@@ -699,8 +691,9 @@ inline arrow::Status BuildArrayFromIndicesWithType(
       return BuildArrayFromIndices<arrow::BinaryArray, arrow::BinaryBuilder>(
           chunked_array, indices_in_table, out_array);
     case arrow::Type::LIST: {
-      auto list_type =
-          std::dynamic_pointer_cast<arrow::ListType>(col_data_type);
+      // static_pointer_cast: switch guarantees type == LIST; avoids
+      // Android RTTI divergence with Arrow static archive.
+      auto list_type = std::static_pointer_cast<arrow::ListType>(col_data_type);
       return BuildListArrayFromIndices(chunked_array, indices_in_table,
                                        list_type, out_array);
     }
diff --git a/src/include/zvec/ailego/buffer/vector_page_table.h b/src/include/zvec/ailego/buffer/vector_page_table.h
index c6a08c9da..8bcc13e99 100644
--- a/src/include/zvec/ailego/buffer/vector_page_table.h
+++ b/src/include/zvec/ailego/buffer/vector_page_table.h
@@ -22,6 +22,7 @@
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
+#include <functional>
 #include <iostream>
 #include <limits>
 #include <map>
@@ -48,16 +49,28 @@ class VectorPageTable {
   struct Entry {
     std::atomic<int> ref_count;
     std::atomic<bool> in_evict_queue;
+    std::atomic<bool> is_dirty;
     char *buffer;
+    size_t file_offset;
   };
 
  public:
-  VectorPageTable() : entry_num_(0), entries_(nullptr) {
+  // Callback invoked by evict_block() to persist a dirty block before its
+  // memory is released. Signature: (block_id, buffer, size, file_offset).
+  using FlushCallback = std::function<int(block_id_t, char *, size_t, size_t)>;
+
+  VectorPageTable() {
     BlockEvictionQueue::get_instance().set_valid(this);
   }
   ~VectorPageTable() {
     BlockEvictionQueue::get_instance().set_invalid(this);
-    delete[] entries_;
+    // Destructor runs without concurrent readers/writers (callers guarantee
+    // no live handles by the time the page table is destroyed), so a relaxed
+    // load is sufficient here.
+    size_t cnt = segment_count_.load(std::memory_order_relaxed);
+    for (size_t i = 0; i < cnt; ++i) {
+      delete[] segments_[i];
+    }
   }
 
   VectorPageTable(const VectorPageTable &) = delete;
@@ -65,7 +78,17 @@ class VectorPageTable {
   VectorPageTable(VectorPageTable &&) = delete;
   VectorPageTable &operator=(VectorPageTable &&) = delete;
 
-  void init(size_t entry_num);
+  //! Initialize the page table to cover `entry_num` entries.
+  //! Returns false (without modifying state) if `entry_num` exceeds the
+  //! statically allocated segment table capacity (kMaxEntries).
+  bool init(size_t entry_num);
+
+  //! Extend the page table to cover at least `new_entry_num` entries.
+  //! Existing entries stay at their original addresses (no invalidation).
+  //! Safe to call while readers operate on existing pages.
+  //! Returns false (without modifying state) if `new_entry_num` exceeds
+  //! the statically allocated segment table capacity (kMaxEntries).
+  bool extend(size_t new_entry_num);
 
   char *acquire_block(block_id_t block_id);
 
@@ -73,25 +96,101 @@ class VectorPageTable {
 
   void evict_block(block_id_t block_id);
 
-  char *set_block_acquired(block_id_t block_id, char *buffer);
+  char *set_block_acquired(block_id_t block_id, char *buffer,
+                           size_t file_offset);
+
+  void set_flush_callback(FlushCallback cb) {
+    flush_callback_ = std::move(cb);
+  }
+
+  //! Mark a loaded block as dirty so that it is persisted on eviction.
+  void mark_dirty(block_id_t block_id) {
+    assert(block_id < entry_num_.load(std::memory_order_acquire));
+    entry_at(block_id).is_dirty.store(true, std::memory_order_relaxed);
+  }
+
+  bool is_block_dirty(block_id_t block_id) const {
+    assert(block_id < entry_num_.load(std::memory_order_acquire));
+    return entry_at(block_id).is_dirty.load(std::memory_order_relaxed);
+  }
+
+  //! Flush a single dirty block without evicting it.  Returns 0 when the
+  //! block is not loaded, not dirty, or no flush callback is set; otherwise
+  //! returns the callback's result (the block stays dirty if it is non-zero).
+  int flush_block(block_id_t block_id) {
+    assert(block_id < entry_num_.load(std::memory_order_acquire));
+    Entry &e = entry_at(block_id);
+    char *buffer = e.buffer;
+    if (!buffer || !flush_callback_) {
+      return 0;
+    }
+    // Claim the dirty flag BEFORE flushing: a mark_dirty() that lands while
+    // the callback runs then survives instead of being cleared afterwards.
+    if (!e.is_dirty.exchange(false, std::memory_order_acq_rel)) return 0;
+    int rc = flush_callback_(block_id, buffer, kVectorPageSize, e.file_offset);
+    // Flush failed: the block still needs persisting, so re-mark it dirty.
+    if (rc != 0) e.is_dirty.store(true, std::memory_order_release);
+    return rc;
+  }
 
+  //! Returns the current number of entries.  Uses acquire ordering so that
+  //! callers iterating over [0, entry_num()) are guaranteed to see all
+  //! segments_[s] writes performed by a concurrent extend()/init().
   size_t entry_num() const {
-    return entry_num_;
+    return entry_num_.load(std::memory_order_acquire);
   }
 
   bool is_released(block_id_t block_id) const {
-    assert(block_id < entry_num_);
-    return entries_[block_id].ref_count.load(std::memory_order_relaxed) <= 0;
+    assert(block_id < entry_num_.load(std::memory_order_acquire));
+    return entry_at(block_id).ref_count.load(std::memory_order_relaxed) <= 0;
   }
 
   inline bool is_dead_block(BlockEvictionQueue::BlockType block) const {
-    Entry &entry = entries_[block.vector_block.first];
-    return !entry.in_evict_queue.load(std::memory_order_relaxed);
+    const Entry &e = entry_at(block.vector_block.first);
+    return !e.in_evict_queue.load(std::memory_order_relaxed);
   }
 
  private:
-  size_t entry_num_{0};
-  Entry *entries_{nullptr};
+  // Segmented page table: entries are split across fixed-size segments so
+  // that extend() can grow the table without moving existing entries.
+  static constexpr size_t kSegmentShift = 16;  // 65536 entries per segment
+  static constexpr size_t kSegmentSize = size_t{1} << kSegmentShift;
+  static constexpr size_t kSegmentMask = kSegmentSize - 1;
+
+ public:
+  static constexpr size_t kMaxSegments =
+      2048;  // up to 128M entries (512GB @ 4K)
+  // Maximum number of entries the segment table can ever hold.  Callers
+  // (e.g. VecBufferPool::extend_file) can use this to pre-validate a target
+  // file size before mutating any on-disk state.
+  static constexpr size_t kMaxEntries = kMaxSegments * kSegmentSize;
+
+ private:
+  // entry_num_ and segment_count_ are mutated by writers in init()/extend()
+  // and observed by readers in entry_num() and the hot-path methods.  They
+  // are atomic to establish a release/acquire synchronization edge with the
+  // (non-atomic) writes to segments_[s] performed prior to the store: any
+  // reader that observes the new entry_num_ is guaranteed to see the
+  // fully-initialized Entry slots in the corresponding segment.
+  std::atomic<size_t> entry_num_{0};
+  std::atomic<size_t> segment_count_{0};
+  Entry *segments_[kMaxSegments]{};
+
+  // Pair with the release-store on segment_count_ in init()/extend() so
+  // that any reader observing the published segment table also sees the
+  // fully-initialized segments_[s] pointer and Entry slots. Without this
+  // acquire load, segments_[s] can be re-read as nullptr or a torn
+  // pointer on weak memory models (and even reordered on x86 under -O2).
+  Entry &entry_at(size_t idx) {
+    (void)segment_count_.load(std::memory_order_acquire);
+    return segments_[idx >> kSegmentShift][idx & kSegmentMask];
+  }
+  const Entry &entry_at(size_t idx) const {
+    (void)segment_count_.load(std::memory_order_acquire);
+    return segments_[idx >> kSegmentShift][idx & kSegmentMask];
+  }
+
+  FlushCallback flush_callback_{};
 };
 
 class VecBufferPoolHandle;
@@ -102,8 +201,11 @@ class VecBufferPool {
 
   static constexpr size_t kMutexBucketCount = 64UL * 1024UL;
 
-  VecBufferPool(const std::string &filename);
+  VecBufferPool(const std::string &filename, bool writable = false);
   ~VecBufferPool() {
+    // Flush any remaining dirty blocks before tearing down memory/fd so that
+    // writes are not silently lost. Safe to call even in read-only mode.
+    (void)this->flush_all();
     for (size_t i = 0; i < page_table_.entry_num(); ++i) {
       assert(page_table_.is_released(i));
       page_table_.evict_block(i);
@@ -123,6 +225,29 @@ class VecBufferPool {
 
   int get_meta(size_t offset, size_t length, char *buffer);
 
+  //! Write a contiguous range via the page cache; marks touched pages dirty.
+  //! Returns 0 on success, -1 on failure (e.g. read-only pool or I/O error).
+  int write_range(size_t file_offset, size_t length, const char *src);
+
+  //! Write raw bytes directly via pwrite, bypassing the page cache. Used for
+  //! metadata regions (header/footer/segments_meta) which are only read via
+  //! get_meta() and never cached.
+  int write_meta(size_t offset, size_t length, const char *buffer);
+
+  //! Iterate all entries and persist any dirty blocks to disk. Safe to call
+  //! repeatedly; no-op in read-only mode.
+  int flush_all();
+
+  //! Extend the backing file to `new_size` bytes via ftruncate (no-op if
+  //! already >= new_size), refresh the cached file_size_, and extend the
+  //! page_table to cover the new range.  Returns true on success, false on
+  //! a read-only pool or I/O failure.
+  bool extend_file(size_t new_size);
+
+  bool writable() const {
+    return writable_;
+  }
+
   size_t file_size() const {
     return file_size_;
   }
@@ -131,6 +256,7 @@ class VecBufferPool {
   int fd_;
   size_t file_size_;
   std::string file_name_;
+  bool writable_{false};
 
  public:
   VectorPageTable page_table_;
@@ -154,6 +280,14 @@ class VecBufferPoolHandle {
 
   int get_meta(size_t offset, size_t length, char *buffer);
 
+  int write_range(size_t file_offset, size_t len, const char *src);
+
+  int write_meta(size_t offset, size_t length, const char *buffer);
+
+  int flush_all();
+
+  bool writable() const;
+
   void release_one(block_id_t block_id);
 
   void acquire_one(block_id_t block_id);
diff --git a/src/include/zvec/core/framework/index_storage.h b/src/include/zvec/core/framework/index_storage.h
index 530073aad..3da2e6669 100644
--- a/src/include/zvec/core/framework/index_storage.h
+++ b/src/include/zvec/core/framework/index_storage.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <cstring>
 #include <zvec/ailego/buffer/vector_page_table.h>
 #include <zvec/ailego/container/params.h>
 #include <zvec/core/framework/index_error.h>
@@ -47,23 +48,35 @@ class IndexStorage : public IndexModule {
     }
     MemoryBlock(void *data) : type_(MemoryBlockType::MBT_MMAP), data_(data) {}
 
-    static MemoryBlock MakeOwned(void *owned) {
+    //! Build an HEAP_SCRATCH MemoryBlock that owns `owned` (allocated via
+    //! ailego_malloc / ailego_aligned_malloc).  `size` is the byte length of
+    //! the buffer and is required so that copy construction / copy
+    //! assignment can deep-copy the buffer instead of aliasing it (a shallow
+    //! copy would result in use-after-free once the original block is
+    //! destructed and frees the buffer).
+    static MemoryBlock MakeOwned(void *owned, size_t size) {
       MemoryBlock mb;
       mb.type_ = MemoryBlockType::MBT_HEAP_SCRATCH;
       mb.data_ = owned;
+      mb.scratch_size_ = size;
       return mb;
     }
 
     MemoryBlock(const MemoryBlock &rhs) {
       switch (rhs.type_) {
         case MemoryBlockType::MBT_MMAP:
-        case MemoryBlockType::MBT_HEAP_SCRATCH:
           this->reset(rhs.data_);
           break;
         case MemoryBlockType::MBT_BUFFERPOOL:
           this->reset(rhs.buffer_pool_handle_, rhs.buffer_block_id_, rhs.data_);
           buffer_pool_handle_->acquire_one(buffer_block_id_);
           break;
+        case MemoryBlockType::MBT_HEAP_SCRATCH:
+          // Deep copy: each owner must hold its own buffer, otherwise the
+          // first destructor frees the buffer and leaves the surviving
+          // copies dangling.
+          deep_copy_from(rhs);
+          break;
         default:
           break;
       }
@@ -83,7 +96,9 @@ class IndexStorage : public IndexModule {
         case MemoryBlockType::MBT_HEAP_SCRATCH:
           type_ = MemoryBlockType::MBT_HEAP_SCRATCH;
           data_ = rhs.data_;
+          scratch_size_ = rhs.scratch_size_;
           rhs.data_ = nullptr;
+          rhs.scratch_size_ = 0;
           rhs.type_ = MemoryBlockType::MBT_UNKNOWN;
           break;
         default:
@@ -103,7 +118,8 @@ class IndexStorage : public IndexModule {
             buffer_pool_handle_->acquire_one(buffer_block_id_);
             break;
           case MemoryBlockType::MBT_HEAP_SCRATCH:
-            this->reset(rhs.data_);
+            release_current();
+            deep_copy_from(rhs);
             break;
           default:
             break;
@@ -125,10 +141,12 @@ class IndexStorage : public IndexModule {
             rhs.type_ = MemoryBlockType::MBT_UNKNOWN;
             break;
           case MemoryBlockType::MBT_HEAP_SCRATCH:
-            release_owned();
+            release_current();
             type_ = MemoryBlockType::MBT_HEAP_SCRATCH;
             data_ = rhs.data_;
+            scratch_size_ = rhs.scratch_size_;
             rhs.data_ = nullptr;
+            rhs.scratch_size_ = 0;
             rhs.type_ = MemoryBlockType::MBT_UNKNOWN;
             break;
           default:
@@ -154,6 +172,7 @@ class IndexStorage : public IndexModule {
           break;
       }
       data_ = nullptr;
+      scratch_size_ = 0;
     }
 
     const void *data() const {
@@ -188,6 +207,10 @@ class IndexStorage : public IndexModule {
     void *data_{nullptr};
     mutable ailego::VecBufferPoolHandle *buffer_pool_handle_{nullptr};
     size_t buffer_block_id_{0};
+    //! Byte size of the heap-scratch buffer pointed to by `data_`; only used
+    //! when type_ == MBT_HEAP_SCRATCH.  Required for safe deep-copy on
+    //! copy-construction / copy-assignment of HEAP_SCRATCH blocks.
+    size_t scratch_size_{0};
 
    private:
     void release_owned() {
@@ -195,6 +218,44 @@ class IndexStorage : public IndexModule {
         ailego_free(data_);
         data_ = nullptr;
       }
+      scratch_size_ = 0;
+    }
+
+    //! Drop whatever the current MemoryBlock holds, regardless of type, so
+    //! that the slot is ready to receive new ownership: releases the buffer
+    //! pool reference or frees the owned scratch buffer, then clears data_
+    //! and resets type_ to MBT_UNKNOWN for the caller to overwrite.
+    void release_current() {
+      switch (type_) {
+        case MemoryBlockType::MBT_BUFFERPOOL:
+          if (buffer_pool_handle_) {
+            buffer_pool_handle_->release_one(buffer_block_id_);
+            buffer_pool_handle_ = nullptr;
+          }
+          break;
+        case MemoryBlockType::MBT_HEAP_SCRATCH:
+          release_owned();
+          break;
+        default:
+          break;
+      }
+      data_ = nullptr;
+      type_ = MemoryBlockType::MBT_UNKNOWN;
+    }
+
+    //! Allocate a fresh buffer of `rhs.scratch_size_` bytes, memcpy
+    //! `rhs.data_` into it, and become the new owner, so the original and
+    //! the copy free their own buffers independently.  If there is nothing
+    //! to copy or the allocation fails, the block holds no buffer (size 0).
+    void deep_copy_from(const MemoryBlock &rhs) {
+      type_ = MemoryBlockType::MBT_HEAP_SCRATCH;
+      const bool has_payload = rhs.scratch_size_ > 0 && rhs.data_;
+      data_ = has_payload ? ailego_malloc(rhs.scratch_size_) : nullptr;
+      scratch_size_ = data_ ? rhs.scratch_size_ : 0;
+      if (data_) {
+        // Guarded: never memcpy into a null buffer if allocation failed.
+        std::memcpy(data_, rhs.data_, scratch_size_);
+      }
+    }
   };
 
diff --git a/tests/core/algorithm/flat/flat_streamer_buffer_test.cc b/tests/core/algorithm/flat/flat_streamer_buffer_test.cc
index 441853c86..e3fce1f24 100644
--- a/tests/core/algorithm/flat/flat_streamer_buffer_test.cc
+++ b/tests/core/algorithm/flat/flat_streamer_buffer_test.cc
@@ -168,6 +168,251 @@ TEST_F(FlatStreamerTest, TestLinearSearch) {
   read_streamer.reset();
 }
 
+TEST_F(FlatStreamerTest, TestLinearSearchBuffer) {
+  MemoryLimitPool::get_instance().init(2 * 1024UL * 1024UL * 1024UL);
+  IndexStreamer::Pointer write_streamer =
+      IndexFactory::CreateStreamer("FlatStreamer");
+  ASSERT_TRUE(write_streamer != nullptr);
+
+  Params params;
+  ASSERT_EQ(0, write_streamer->init(*index_meta_ptr_, params));
+  auto storage = IndexFactory::CreateStorage("BufferStorage");
+  ASSERT_NE(nullptr, storage);
+  Params stg_params;
+  ASSERT_EQ(0, storage->init(stg_params));
+  ASSERT_EQ(0, storage->open(dir_ + "Test/LinearSearchBuffer", true));
+  ASSERT_EQ(0, write_streamer->open(storage));
+
+  auto ctx = write_streamer->create_context();
+  ASSERT_TRUE(!!ctx);
+
+  size_t cnt = 10000UL;
+  IndexQueryMeta qmeta(IndexMeta::DT_FP32, dim);
+  for (size_t i = 0; i < cnt; i++) {
+    NumericalVector<float> vec(dim);
+    for (size_t j = 0; j < dim; ++j) {
+      vec[j] = i;
+    }
+    write_streamer->add_impl(i, vec.data(), qmeta, ctx);
+  }
+  write_streamer->flush(0UL);
+  write_streamer->close();
+  write_streamer.reset();
+  storage->close();
+
+  IndexStreamer::Pointer read_streamer =
+      IndexFactory::CreateStreamer("FlatStreamer");
+  ASSERT_EQ(0, read_streamer->init(*index_meta_ptr_, params));
+  auto read_storage = IndexFactory::CreateStorage("BufferStorage");
+  ASSERT_NE(nullptr, read_storage);
+  ASSERT_EQ(0, read_storage->init(stg_params));
+  ASSERT_EQ(0, read_storage->open(dir_ + "Test/LinearSearchBuffer", false));
+  ASSERT_EQ(0, read_streamer->open(read_storage));
+  size_t topk = 3;
+  auto provider = read_streamer->create_provider();
+  for (size_t i = 0; i < cnt; i += 1) {
+    NumericalVector<float> vec(dim);
+    for (size_t j = 0; j < dim; ++j) {
+      vec[j] = i;
+    }
+    ctx->set_topk(topk);
+    ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx));
+    auto &result1 = ctx->result();
+    ASSERT_EQ(topk, result1.size());
+    IndexStorage::MemoryBlock block;
+    ASSERT_EQ(0, provider->get_vector(result1[0].key(), block));
+    const float *data = (float *)block.data();
+    for (size_t j = 0; j < dim; ++j) {
+      ASSERT_FLOAT_EQ(data[j], i);
+    }
+    ASSERT_EQ(i, result1[0].key());
+
+    for (size_t j = 0; j < dim; ++j) {
+      vec[j] = i + 0.1f;
+    }
+    ctx->set_topk(topk);
+    ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx));
+    auto &result2 = ctx->result();
+    ASSERT_EQ(topk, result2.size());
+    ASSERT_EQ(i, result2[0].key());
+    ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key());
+    ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key());
+  }
+
+  ctx->set_topk(100U);
+  NumericalVector<float> vec(dim);
+  for (size_t j = 0; j < dim; ++j) {
+    vec[j] = 10.1f;
+  }
+  ASSERT_EQ(0, read_streamer->search_bf_impl(vec.data(), qmeta, ctx));
+  auto &result = ctx->result();
+  ASSERT_EQ(100U, result.size());
+  ASSERT_EQ(10, result[0].key());
+  ASSERT_EQ(11, result[1].key());
+  ASSERT_EQ(5, result[10].key());
+  ASSERT_EQ(0, result[20].key());
+  ASSERT_EQ(30, result[30].key());
+  ASSERT_EQ(35, result[35].key());
+  ASSERT_EQ(99, result[99].key());
+
+  ElapsedTime elapsed_time;
+  for (size_t i = 0; i < cnt; i += 1) {
+    NumericalVector<float> vec(dim);
+    for (size_t j = 0; j < dim; ++j) {
+      vec[j] = i;
+    }
+    ctx->set_topk(topk);
+    ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx));
+    auto &result1 = ctx->result();
+    ASSERT_EQ(topk, result1.size());
+    IndexStorage::MemoryBlock block;
+    ASSERT_EQ(0, provider->get_vector(result1[0].key(), block));
+    const float *data = (float *)block.data();
+    for (size_t j = 0; j < dim; ++j) {
+      ASSERT_FLOAT_EQ(data[j], i);
+    }
+    ASSERT_EQ(i, result1[0].key());
+
+    for (size_t j = 0; j < dim; ++j) {
+      vec[j] = i + 0.1f;
+    }
+    ctx->set_topk(topk);
+    ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx));
+    auto &result2 = ctx->result();
+    ASSERT_EQ(topk, result2.size());
+    ASSERT_EQ(i, result2[0].key());
+    ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key());
+    ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key());
+  }
+  cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl;
+
+  read_streamer->close();
+  read_streamer.reset();
+}
+
+// Writes with BufferStorage, reads back via MMapFileStorage (own file path).
+TEST_F(FlatStreamerTest, TestLinearSearchBufferMMap) {
+  MemoryLimitPool::get_instance().init(2 * 1024UL * 1024UL * 1024UL);
+  IndexStreamer::Pointer write_streamer =
+      IndexFactory::CreateStreamer("FlatStreamer");
+  ASSERT_TRUE(write_streamer != nullptr);
+
+  Params params;
+  ASSERT_EQ(0, write_streamer->init(*index_meta_ptr_, params));
+  auto storage = IndexFactory::CreateStorage("BufferStorage");
+  ASSERT_NE(nullptr, storage);
+  Params stg_params;
+  ASSERT_EQ(0, storage->init(stg_params));
+  ASSERT_EQ(0, storage->open(dir_ + "Test/LinearSearchBufferMMap", true));
+  ASSERT_EQ(0, write_streamer->open(storage));
+
+  auto ctx = write_streamer->create_context();
+  ASSERT_TRUE(!!ctx);
+
+  size_t cnt = 10000UL;
+  IndexQueryMeta qmeta(IndexMeta::DT_FP32, dim);
+  for (size_t i = 0; i < cnt; i++) {
+    NumericalVector<float> vec(dim);
+    for (size_t j = 0; j < dim; ++j) {
+      vec[j] = i;
+    }
+    write_streamer->add_impl(i, vec.data(), qmeta, ctx);
+  }
+  write_streamer->flush(0UL);
+  write_streamer->close();
+  write_streamer.reset();
+  storage->close();
+
+  IndexStreamer::Pointer read_streamer =
+      IndexFactory::CreateStreamer("FlatStreamer");
+  ASSERT_EQ(0, read_streamer->init(*index_meta_ptr_, params));
+  auto read_storage = IndexFactory::CreateStorage("MMapFileStorage");
+  ASSERT_NE(nullptr, read_storage);
+  ASSERT_EQ(0, read_storage->init(stg_params));
+  ASSERT_EQ(0, read_storage->open(dir_ + "Test/LinearSearchBufferMMap", false));
+  ASSERT_EQ(0, read_streamer->open(read_storage));
+  size_t topk = 3;
+  auto provider = read_streamer->create_provider();
+  for (size_t i = 0; i < cnt; i += 1) {
+    NumericalVector<float> vec(dim);
+    for (size_t j = 0; j < dim; ++j) {
+      vec[j] = i;
+    }
+    ctx->set_topk(topk);
+    ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx));
+    auto &result1 = ctx->result();
+    ASSERT_EQ(topk, result1.size());
+    IndexStorage::MemoryBlock block;
+    ASSERT_EQ(0, provider->get_vector(result1[0].key(), block));
+    const float *data = (float *)block.data();
+    for (size_t j = 0; j < dim; ++j) {
+      ASSERT_FLOAT_EQ(data[j], i);
+    }
+    ASSERT_EQ(i, result1[0].key());
+
+    for (size_t j = 0; j < dim; ++j) {
+      vec[j] = i + 0.1f;
+    }
+    ctx->set_topk(topk);
+    ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx));
+    auto &result2 = ctx->result();
+    ASSERT_EQ(topk, result2.size());
+    ASSERT_EQ(i, result2[0].key());
+    ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key());
+    ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key());
+  }
+
+  ctx->set_topk(100U);
+  NumericalVector<float> vec(dim);
+  for (size_t j = 0; j < dim; ++j) {
+    vec[j] = 10.1f;
+  }
+  ASSERT_EQ(0, read_streamer->search_bf_impl(vec.data(), qmeta, ctx));
+  auto &result = ctx->result();
+  ASSERT_EQ(100U, result.size());
+  ASSERT_EQ(10, result[0].key());
+  ASSERT_EQ(11, result[1].key());
+  ASSERT_EQ(5, result[10].key());
+  ASSERT_EQ(0, result[20].key());
+  ASSERT_EQ(30, result[30].key());
+  ASSERT_EQ(35, result[35].key());
+  ASSERT_EQ(99, result[99].key());
+
+  ElapsedTime elapsed_time;
+  for (size_t i = 0; i < cnt; i += 1) {
+    NumericalVector<float> vec(dim);
+    for (size_t j = 0; j < dim; ++j) {
+      vec[j] = i;
+    }
+    ctx->set_topk(topk);
+    ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx));
+    auto &result1 = ctx->result();
+    ASSERT_EQ(topk, result1.size());
+    IndexStorage::MemoryBlock block;
+    ASSERT_EQ(0, provider->get_vector(result1[0].key(), block));
+    const float *data = (float *)block.data();
+    for (size_t j = 0; j < dim; ++j) {
+      ASSERT_FLOAT_EQ(data[j], i);
+    }
+    ASSERT_EQ(i, result1[0].key());
+
+    for (size_t j = 0; j < dim; ++j) {
+      vec[j] = i + 0.1f;
+    }
+    ctx->set_topk(topk);
+    ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx));
+    auto &result2 = ctx->result();
+    ASSERT_EQ(topk, result2.size());
+    ASSERT_EQ(i, result2[0].key());
+    ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key());
+    ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key());
+  }
+  cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl;
+
+  read_streamer->close();
+  read_streamer.reset();
+}
+
 TEST_F(FlatStreamerTest, TestLinearSearchWithLRU) {
   MemoryLimitPool::get_instance().init(100 * 1024UL * 1024UL);
 #ifdef __ANDROID__
@@ -351,7 +596,6 @@ TEST_F(FlatStreamerTest, TestLinearSearchMMap) {
     ASSERT_EQ(topk, result1.size());
     IndexStorage::MemoryBlock block;
     ASSERT_EQ(0, provider->get_vector(result1[0].key(), block));
-    const float *data = (float *)block.data();
     for (size_t j = 0; j < dim; ++j) {
       const float *data = (float *)provider->get_vector(result1[0].key());
       EXPECT_FLOAT_EQ(data[j], i);
diff --git a/tests/core/algorithm/hnsw/hnsw_streamer_buffer_test.cc b/tests/core/algorithm/hnsw/hnsw_streamer_buffer_test.cc
index cf3093e22..cd21ff912 100644
--- a/tests/core/algorithm/hnsw/hnsw_streamer_buffer_test.cc
+++ b/tests/core/algorithm/hnsw/hnsw_streamer_buffer_test.cc
@@ -171,6 +171,254 @@ TEST_F(HnswStreamerTest, TestHnswSearch) {
   cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl;
 }
 
+TEST_F(HnswStreamerTest, TestHnswSearchBuffer) {
+  MemoryLimitPool::get_instance().init(2 * 1024UL * 1024UL * 1024UL);
+  IndexStreamer::Pointer write_streamer =
+      IndexFactory::CreateStreamer("HnswStreamer");
+  ASSERT_TRUE(write_streamer != nullptr);
+
+  Params params;
+  params.set(PARAM_HNSW_STREAMER_GET_VECTOR_ENABLE, true);
+
+  ASSERT_EQ(0, write_streamer->init(*index_meta_ptr_, params));
+  auto storage = IndexFactory::CreateStorage("BufferStorage");
+  ASSERT_NE(nullptr, storage);
+  Params stg_params;
+  ASSERT_EQ(0, storage->init(stg_params));
+  ASSERT_EQ(0, storage->open(dir_ + "Test/TestHnswSearchBuffer", true));
+  ASSERT_EQ(0, write_streamer->open(storage));
+
+  auto ctx = write_streamer->create_context();
+  ASSERT_TRUE(!!ctx);
+
+  size_t cnt = 10000UL;
+  IndexQueryMeta qmeta(IndexMeta::DT_FP32, dim);
+  for (size_t i = 0; i < cnt; i++) {
+    NumericalVector<float> vec(dim);
+    for (size_t j = 0; j < dim; ++j) {
+      vec[j] = i;
+    }
+    write_streamer->add_impl(i, vec.data(), qmeta, ctx);
+  }
+  write_streamer->flush(0UL);
+  write_streamer->close();
+  write_streamer.reset();
+  storage->close();
+
+  IndexStreamer::Pointer read_streamer =
+      IndexFactory::CreateStreamer("HnswStreamer");
+  ASSERT_EQ(0, read_streamer->init(*index_meta_ptr_, params));
+  auto read_storage = IndexFactory::CreateStorage("BufferStorage");
+  ASSERT_NE(nullptr, read_storage);
+  ASSERT_EQ(0, read_storage->init(stg_params));
+  ASSERT_EQ(0, read_storage->open(dir_ + "Test/TestHnswSearchBuffer", false));
+  ASSERT_EQ(0, read_streamer->open(read_storage));
+  size_t topk = 3;
+  auto provider = read_streamer->create_provider();
+  for (size_t i = 0; i < cnt; i += 1) {
+    NumericalVector<float> vec(dim);
+    for (size_t j = 0; j < dim; ++j) {
+      vec[j] = i;
+    }
+    ctx->set_topk(topk);
+    ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx));
+    auto &result1 = ctx->result();
+    ASSERT_EQ(topk, result1.size());
+    IndexStorage::MemoryBlock block;
+    ASSERT_EQ(0, provider->get_vector(result1[0].key(), block));
+    const float *data = (float *)block.data();
+    for (size_t j = 0; j < dim; ++j) {
+      ASSERT_EQ(data[j], i);
+    }
+    ASSERT_EQ(i, result1[0].key());
+
+    for (size_t j = 0; j < dim; ++j) {
+      vec[j] = i + 0.1f;
+    }
+    ctx->set_topk(topk);
+    ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx));
+    auto &result2 = ctx->result();
+    ASSERT_EQ(topk, result2.size());
+    ASSERT_EQ(i, result2[0].key());
+    ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key());
+    ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key());
+  }
+
+  ctx->set_topk(100U);
+  NumericalVector<float> vec(dim);
+  for (size_t j = 0; j < dim; ++j) {
+    vec[j] = 10.1f;
+  }
+  ASSERT_EQ(0, read_streamer->search_bf_impl(vec.data(), qmeta, ctx));
+  auto &result = ctx->result();
+  ASSERT_EQ(100U, result.size());
+  ASSERT_EQ(10, result[0].key());
+  ASSERT_EQ(11, result[1].key());
+  ASSERT_EQ(5, result[10].key());
+  ASSERT_EQ(0, result[20].key());
+  ASSERT_EQ(30, result[30].key());
+  ASSERT_EQ(35, result[35].key());
+  ASSERT_EQ(99, result[99].key());
+
+  ElapsedTime elapsed_time;
+  for (size_t i = 0; i < cnt; i += 1) {
+    NumericalVector<float> vec(dim);
+    for (size_t j = 0; j < dim; ++j) {
+      vec[j] = i;
+    }
+    ctx->set_topk(topk);
+    ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx));
+    auto &result1 = ctx->result();
+    ASSERT_EQ(topk, result1.size());
+    IndexStorage::MemoryBlock block;
+    ASSERT_EQ(0, provider->get_vector(result1[0].key(), block));
+    const float *data = (float *)block.data();
+    for (size_t j = 0; j < dim; ++j) {
+      ASSERT_EQ(data[j], i);
+    }
+    ASSERT_EQ(i, result1[0].key());
+
+    for (size_t j = 0; j < dim; ++j) {
+      vec[j] = i + 0.1f;
+    }
+    ctx->set_topk(topk);
+    ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx));
+    auto &result2 = ctx->result();
+    ASSERT_EQ(topk, result2.size());
+    ASSERT_EQ(i, result2[0].key());
+    ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key());
+    ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key());
+  }
+
+  read_streamer->close();
+  read_streamer.reset();
+  cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl;
+}
+
+TEST_F(HnswStreamerTest, TestHnswSearchBufferMMap) {
+  MemoryLimitPool::get_instance().init(2 * 1024UL * 1024UL * 1024UL);
+  IndexStreamer::Pointer write_streamer =
+      IndexFactory::CreateStreamer("HnswStreamer");
+  ASSERT_TRUE(write_streamer != nullptr);
+
+  Params params;
+  params.set(PARAM_HNSW_STREAMER_GET_VECTOR_ENABLE, true);
+
+  ASSERT_EQ(0, write_streamer->init(*index_meta_ptr_, params));
+  auto storage = IndexFactory::CreateStorage("BufferStorage");
+  ASSERT_NE(nullptr, storage);
+  Params stg_params;
+  ASSERT_EQ(0, storage->init(stg_params));
+  ASSERT_EQ(0, storage->open(dir_ + "Test/TestHnswSearchBufferMMap", true));
+  ASSERT_EQ(0, write_streamer->open(storage));
+
+  auto ctx = write_streamer->create_context();
+  ASSERT_TRUE(!!ctx);
+
+  size_t cnt = 10000UL;
+  IndexQueryMeta qmeta(IndexMeta::DT_FP32, dim);
+  for (size_t i = 0; i < cnt; i++) {
+    NumericalVector<float> vec(dim);
+    for (size_t j = 0; j < dim; ++j) {
+      vec[j] = i;
+    }
+    write_streamer->add_impl(i, vec.data(), qmeta, ctx);
+  }
+  write_streamer->flush(0UL);
+  write_streamer->close();
+  write_streamer.reset();
+  storage->close();
+
+  IndexStreamer::Pointer read_streamer =
+      IndexFactory::CreateStreamer("HnswStreamer");
+  ASSERT_EQ(0, read_streamer->init(*index_meta_ptr_, params));
+  auto read_storage = IndexFactory::CreateStorage("MMapFileStorage");
+  ASSERT_NE(nullptr, read_storage);
+  ASSERT_EQ(0, read_storage->init(stg_params));
+  ASSERT_EQ(0, read_storage->open(dir_ + "Test/TestHnswSearchBufferMMap", false));
+  ASSERT_EQ(0, read_streamer->open(read_storage));
+  size_t topk = 3;
+  auto provider = read_streamer->create_provider();
+  for (size_t i = 0; i < cnt; i += 1) {
+    NumericalVector<float> vec(dim);
+    for (size_t j = 0; j < dim; ++j) {
+      vec[j] = i;
+    }
+    ctx->set_topk(topk);
+    ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx));
+    auto &result1 = ctx->result();
+    ASSERT_EQ(topk, result1.size());
+    IndexStorage::MemoryBlock block;
+    ASSERT_EQ(0, provider->get_vector(result1[0].key(), block));
+    const float *data = (float *)block.data();
+    for (size_t j = 0; j < dim; ++j) {
+      ASSERT_EQ(data[j], i);
+    }
+    ASSERT_EQ(i, result1[0].key());
+
+    for (size_t j = 0; j < dim; ++j) {
+      vec[j] = i + 0.1f;
+    }
+    ctx->set_topk(topk);
+    ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx));
+    auto &result2 = ctx->result();
+    ASSERT_EQ(topk, result2.size());
+    ASSERT_EQ(i, result2[0].key());
+    ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key());
+    ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key());
+  }
+
+  ctx->set_topk(100U);
+  NumericalVector<float> vec(dim);
+  for (size_t j = 0; j < dim; ++j) {
+    vec[j] = 10.1f;
+  }
+  ASSERT_EQ(0, read_streamer->search_bf_impl(vec.data(), qmeta, ctx));
+  auto &result = ctx->result();
+  ASSERT_EQ(100U, result.size());
+  ASSERT_EQ(10, result[0].key());
+  ASSERT_EQ(11, result[1].key());
+  ASSERT_EQ(5, result[10].key());
+  ASSERT_EQ(0, result[20].key());
+  ASSERT_EQ(30, result[30].key());
+  ASSERT_EQ(35, result[35].key());
+  ASSERT_EQ(99, result[99].key());
+
+  ElapsedTime elapsed_time;
+  for (size_t i = 0; i < cnt; i += 1) {
+    NumericalVector<float> vec(dim);
+    for (size_t j = 0; j < dim; ++j) {
+      vec[j] = i;
+    }
+    ctx->set_topk(topk);
+    ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx));
+    auto &result1 = ctx->result();
+    ASSERT_EQ(topk, result1.size());
+    IndexStorage::MemoryBlock block;
+    ASSERT_EQ(0, provider->get_vector(result1[0].key(), block));
+    const float *data = (float *)block.data();
+    for (size_t j = 0; j < dim; ++j) {
+      ASSERT_EQ(data[j], i);
+    }
+    ASSERT_EQ(i, result1[0].key());
+
+    for (size_t j = 0; j < dim; ++j) {
+      vec[j] = i + 0.1f;
+    }
+    ctx->set_topk(topk);
+    ASSERT_EQ(0, read_streamer->search_impl(vec.data(), qmeta, ctx));
+    auto &result2 = ctx->result();
+    ASSERT_EQ(topk, result2.size());
+    ASSERT_EQ(i, result2[0].key());
+    ASSERT_EQ(i == cnt - 1 ? i - 1 : i + 1, result2[1].key());
+    ASSERT_EQ(i == 0 ? 2 : (i == cnt - 1 ? i - 2 : i - 1), result2[2].key());
+  }
+
+  read_streamer->close();
+  read_streamer.reset();
+  cout << "Elapsed time: " << elapsed_time.milli_seconds() << " ms" << endl;
+}
+
 TEST_F(HnswStreamerTest, TestHnswSearchMMap) {
   IndexStreamer::Pointer write_streamer =
       IndexFactory::CreateStreamer("HnswStreamer");
diff --git a/tests/core/utility/buffer_storage_write_test.cc b/tests/core/utility/buffer_storage_write_test.cc
new file mode 100644
index 000000000..a97a32c17
--- /dev/null
+++ b/tests/core/utility/buffer_storage_write_test.cc
@@ -0,0 +1,1181 @@
+// Copyright 2025-present the zvec project
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <atomic>
+#include <cstring>
+#include <numeric>
+#include <string>
+#include <thread>
+#include <vector>
+#include <gtest/gtest.h>
+#include <zvec/ailego/buffer/block_eviction_queue.h>
+#include <zvec/ailego/buffer/buffer_manager.h>
+#include <zvec/ailego/io/file.h>
+#include <zvec/core/framework/index_factory.h>
+#include <zvec/core/framework/index_helper.h>
+
+using namespace zvec;
+using namespace zvec::core;
+
+class BufferStorageWriteTest : public ::testing::Test {
+ protected:
+  static void SetUpTestSuite() {
+    // Size the shared memory limit pool once (64 MiB) before any test runs.
+    ailego::MemoryLimitPool::get_instance().init(64 * 1024UL * 1024UL);
+  }
+
+  void SetUp() override {
+    file_path_ = "buffer_storage_write_test_dir/test_" +
+                 std::to_string(reinterpret_cast<uintptr_t>(this));
+    ailego::File::Delete(file_path_);
+    ailego::File::MakePath("buffer_storage_write_test_dir");
+  }
+
+  void TearDown() override { ailego::File::Delete(file_path_); }
+
+  // Open file_path_ in writable mode; returns nullptr if any step fails.
+  IndexStorage::Pointer OpenWritable() {
+    auto storage = IndexFactory::CreateStorage("BufferStorage");
+    if (!storage) return nullptr;
+    ailego::Params params;
+    if (storage->init(params) != 0) return nullptr;
+    if (storage->open(file_path_, true) != 0) return nullptr;
+    return storage;
+  }
+
+  // Open file_path_ in read-only mode; returns nullptr if any step fails.
+  IndexStorage::Pointer OpenReadOnly() {
+    auto storage = IndexFactory::CreateStorage("BufferStorage");
+    if (!storage) return nullptr;
+    ailego::Params params;
+    if (storage->init(params) != 0) return nullptr;
+    if (storage->open(file_path_, false) != 0) return nullptr;
+    return storage;
+  }
+
+  std::string file_path_;
+};
+
+// ===== Basic Write Tests =====
+
+// Test: Create new index via BufferStorage, append segment, write data, read back
+TEST_F(BufferStorageWriteTest, WriteBasicCreateAndWrite) {
+  auto storage = OpenWritable();
+  ASSERT_TRUE(storage);
+
+  ASSERT_EQ(0, storage->append("seg1", 4096));
+  auto seg = storage->get("seg1");
+  ASSERT_TRUE(seg);
+
+  std::string data = "Hello BufferStorage Write!";
+  EXPECT_EQ(data.size(), seg->write(0, data.data(), data.size()));
+
+  // Verify data via fetch
+  std::vector<char> buf(data.size());
+  EXPECT_EQ(data.size(), seg->fetch(0, buf.data(), buf.size()));
+  EXPECT_EQ(data, std::string(buf.data(), buf.size()));
+
+  // data_size should reflect the written bytes
+  EXPECT_EQ(data.size(), seg->data_size());
+  EXPECT_EQ(0, storage->close());
+}
+
+// Test: Write at non-zero offset within the segment
+TEST_F(BufferStorageWriteTest, WriteAtNonZeroOffset) {
+  auto storage = OpenWritable();
+  ASSERT_TRUE(storage);
+
+  ASSERT_EQ(0, storage->append("seg1", 8192));
+  auto seg = storage->get("seg1");
+  ASSERT_TRUE(seg);
+
+  // First write at offset 0
+  std::string first = "AAAA";
+  EXPECT_EQ(first.size(), seg->write(0, first.data(), first.size()));
+
+  // Second write at offset 100
+  std::string second = "BBBB";
+  EXPECT_EQ(second.size(), seg->write(100, second.data(), second.size()));
+
+  // data_size should be max(first.end, second.end) = 104
+  EXPECT_EQ(104u, seg->data_size());
+
+  // Verify both writes
+  std::vector<char> buf1(first.size());
+  EXPECT_EQ(first.size(), seg->fetch(0, buf1.data(), buf1.size()));
+  EXPECT_EQ(first, std::string(buf1.data(), buf1.size()));
+
+  std::vector<char> buf2(second.size());
+  EXPECT_EQ(second.size(), seg->fetch(100, buf2.data(), buf2.size()));
+  EXPECT_EQ(second, std::string(buf2.data(), buf2.size()));
+
+  EXPECT_EQ(0, storage->close());
+}
+
+// Test: Write to multiple independent segments
+TEST_F(BufferStorageWriteTest, WriteMultipleSegments) {
+  auto storage = OpenWritable();
+  ASSERT_TRUE(storage);
+
+  ASSERT_EQ(0, storage->append("seg_a", 4096));
+  ASSERT_EQ(0, storage->append("seg_b", 4096));
+  ASSERT_EQ(0, storage->append("seg_c", 4096));
+
+  auto seg_a = storage->get("seg_a");
+  auto seg_b = storage->get("seg_b");
+  auto seg_c = storage->get("seg_c");
+  ASSERT_TRUE(seg_a);
+  ASSERT_TRUE(seg_b);
+  ASSERT_TRUE(seg_c);
+
+  std::string da = "data_for_a";
+  std::string db = "data_for_b_longer";
+  std::string dc = "c";
+
+  EXPECT_EQ(da.size(), seg_a->write(0, da.data(), da.size()));
+  EXPECT_EQ(db.size(), seg_b->write(0, db.data(), db.size()));
+  EXPECT_EQ(dc.size(), seg_c->write(0, dc.data(), dc.size()));
+
+  // Verify independently
+  std::vector<char> buf(db.size());
+  EXPECT_EQ(da.size(), seg_a->fetch(0, buf.data(), da.size()));
+  EXPECT_EQ(da, std::string(buf.data(), da.size()));
+
+  EXPECT_EQ(db.size(), seg_b->fetch(0, buf.data(), db.size()));
+  EXPECT_EQ(db, std::string(buf.data(), db.size()));
+
+  EXPECT_EQ(dc.size(), seg_c->fetch(0, buf.data(), dc.size()));
+  EXPECT_EQ(dc, std::string(buf.data(), dc.size()));
+
+  EXPECT_EQ(0, storage->close());
+}
+
+// Test: Overwrite existing data at the same offset
+TEST_F(BufferStorageWriteTest, WriteOverwrite) {
+  auto storage = OpenWritable();
+  ASSERT_TRUE(storage);
+
+  ASSERT_EQ(0, storage->append("seg1", 4096));
+  auto seg = storage->get("seg1");
+  ASSERT_TRUE(seg);
+
+  std::string first = "XXXXXXXX";
+  EXPECT_EQ(first.size(), seg->write(0, first.data(), first.size()));
+
+  std::string second = "YYYYYYYY";
+  EXPECT_EQ(second.size(), seg->write(0, second.data(), second.size()));
+
+  // Second write should overwrite
+  std::vector<char> buf(second.size());
+  EXPECT_EQ(second.size(), seg->fetch(0, buf.data(), buf.size()));
+  EXPECT_EQ(second, std::string(buf.data(), buf.size()));
+
+  EXPECT_EQ(0, storage->close());
+}
+
+// ===== Boundary / Error Tests =====
+
+// Test: A write that would run past the segment's capacity returns 0
+TEST_F(BufferStorageWriteTest, WriteExceedsCapacity) {
+  auto storage = OpenWritable();
+  ASSERT_TRUE(storage);
+
+  // Append a small segment; the checks below only need capacity() > 0
+  ASSERT_EQ(0, storage->append("seg1", 4096));
+  auto seg = storage->get("seg1");
+  ASSERT_TRUE(seg);
+
+  size_t cap = seg->capacity();
+  ASSERT_GT(cap, 0u);
+
+  // Length alone overflows: offset 0 + (cap + 1) bytes > capacity
+  std::vector<char> big_data(cap + 1, 'Z');
+  EXPECT_EQ(0u, seg->write(0, big_data.data(), big_data.size()));
+
+  // Start offset (cap + 1) already lies past the end of the segment
+  std::string small = "small";
+  EXPECT_EQ(0u, seg->write(cap + 1, small.data(), small.size()));
+
+  EXPECT_EQ(0, storage->close());
+}
+
+// Test: Write with zero length (edge case)
+TEST_F(BufferStorageWriteTest, WriteZeroLength) {
+  auto storage = OpenWritable();
+  ASSERT_TRUE(storage);
+
+  ASSERT_EQ(0, storage->append("seg1", 4096));
+  auto seg = storage->get("seg1");
+  ASSERT_TRUE(seg);
+
+  // A zero-length write reports 0 bytes written and must not grow data_size
+  EXPECT_EQ(0u, seg->write(0, "x", 0));
+  EXPECT_EQ(0u, seg->data_size());
+
+  EXPECT_EQ(0, storage->close());
+}
+
+// ===== Persistence Tests =====
+
+// Test: Write, flush, close, reopen, verify data persisted
+TEST_F(BufferStorageWriteTest, WriteFlushReopenVerify) {
+  std::string data = "Persistent data that survives close/reopen";
+
+  {
+    auto storage = OpenWritable();
+    ASSERT_TRUE(storage);
+    ASSERT_EQ(0, storage->append("persist_seg", 8192));
+    auto seg = storage->get("persist_seg");
+    ASSERT_TRUE(seg);
+    EXPECT_EQ(data.size(), seg->write(0, data.data(), data.size()));
+    EXPECT_EQ(0, storage->flush());
+    EXPECT_EQ(0, storage->close());
+  }
+
+  // Reopen in read-only mode and verify
+  {
+    auto storage = OpenReadOnly();
+    ASSERT_TRUE(storage);
+    auto seg = storage->get("persist_seg");
+    ASSERT_TRUE(seg);
+    EXPECT_EQ(data.size(), seg->data_size());
+
+    std::vector<char> buf(data.size());
+    EXPECT_EQ(data.size(), seg->fetch(0, buf.data(), buf.size()));
+    EXPECT_EQ(data, std::string(buf.data(), buf.size()));
+    EXPECT_EQ(0, storage->close());
+  }
+}
+
+// Test: Multiple write-flush cycles persist all data
+TEST_F(BufferStorageWriteTest, WriteMultipleFlushCycles) {
+  std::string data1 = "first_write";
+  std::string data2 = "second_write_longer";
+
+  {
+    auto storage = OpenWritable();
+    ASSERT_TRUE(storage);
+    ASSERT_EQ(0, storage->append("seg1", 4096));
+    auto seg = storage->get("seg1");
+    ASSERT_TRUE(seg);
+
+    // First write + flush
+    EXPECT_EQ(data1.size(), seg->write(0, data1.data(), data1.size()));
+    EXPECT_EQ(0, storage->flush());
+
+    // Second write at a different offset + flush
+    EXPECT_EQ(data2.size(),
+              seg->write(200, data2.data(), data2.size()));
+    EXPECT_EQ(0, storage->flush());
+    EXPECT_EQ(0, storage->close());
+  }
+
+  // Verify persistence
+  {
+    auto storage = OpenReadOnly();
+    ASSERT_TRUE(storage);
+    auto seg = storage->get("seg1");
+    ASSERT_TRUE(seg);
+
+    std::vector<char> buf1(data1.size());
+    EXPECT_EQ(data1.size(), seg->fetch(0, buf1.data(), buf1.size()));
+    EXPECT_EQ(data1, std::string(buf1.data(), buf1.size()));
+
+    std::vector<char> buf2(data2.size());
+    EXPECT_EQ(data2.size(), seg->fetch(200, buf2.data(), buf2.size()));
+    EXPECT_EQ(data2, std::string(buf2.data(), buf2.size()));
+
+    EXPECT_EQ(0, storage->close());
+  }
+}
+
+// Test: Close without explicit flush still persists (close_index does flush)
+TEST_F(BufferStorageWriteTest, WriteCloseWithoutExplicitFlush) {
+  std::string data = "should_persist_on_close";
+
+  {
+    auto storage = OpenWritable();
+    ASSERT_TRUE(storage);
+    ASSERT_EQ(0, storage->append("seg1", 4096));
+    auto seg = storage->get("seg1");
+    ASSERT_TRUE(seg);
+    EXPECT_EQ(data.size(), seg->write(0, data.data(), data.size()));
+    // No explicit flush - close should handle it
+    EXPECT_EQ(0, storage->close());
+  }
+
+  {
+    auto storage = OpenReadOnly();
+    ASSERT_TRUE(storage);
+    auto seg = storage->get("seg1");
+    ASSERT_TRUE(seg);
+    std::vector<char> buf(data.size());
+    EXPECT_EQ(data.size(), seg->fetch(0, buf.data(), buf.size()));
+    EXPECT_EQ(data, std::string(buf.data(), buf.size()));
+    EXPECT_EQ(0, storage->close());
+  }
+}
+
+// ===== Read-Only Behavior =====
+
+// Test: Write to read-only storage is a silent no-op (returns len)
+TEST_F(BufferStorageWriteTest, WriteReadOnlyNoOp) {
+  const std::string seed = "initial";
+  {
+    auto storage = OpenWritable();
+    ASSERT_TRUE(storage);
+    ASSERT_EQ(0, storage->append("seg1", 4096));
+    auto seg = storage->get("seg1");
+    ASSERT_TRUE(seg);
+    // Create the index file and seed the segment; abort if the seed is lost.
+    ASSERT_EQ(seed.size(), seg->write(0, seed.data(), seed.size()));
+    EXPECT_EQ(0, storage->flush());
+    EXPECT_EQ(0, storage->close());
+  }
+
+  // Open read-only and attempt write
+  {
+    auto storage = OpenReadOnly();
+    ASSERT_TRUE(storage);
+    auto seg = storage->get("seg1");
+    ASSERT_TRUE(seg);
+
+    std::string new_data = "overwrite_attempt";
+    // Should return len (silent no-op)
+    EXPECT_EQ(new_data.size(),
+              seg->write(0, new_data.data(), new_data.size()));
+
+    // Data should remain unchanged (still the seeded content)
+    std::vector<char> buf(seed.size());
+    EXPECT_EQ(seed.size(), seg->fetch(0, buf.data(), buf.size()));
+    EXPECT_EQ(seed, std::string(buf.data(), buf.size()));
+
+    EXPECT_EQ(0, storage->close());
+  }
+}
+
+// ===== Resize Tests =====
+
+// Test: Resize increases data_size without writing
+TEST_F(BufferStorageWriteTest, ResizeGrow) {
+  auto storage = OpenWritable();
+  ASSERT_TRUE(storage);
+
+  ASSERT_EQ(0, storage->append("seg1", 4096));
+  auto seg = storage->get("seg1");
+  ASSERT_TRUE(seg);
+
+  EXPECT_EQ(0u, seg->data_size());
+  size_t new_size = seg->resize(512);
+  EXPECT_EQ(512u, new_size);
+  EXPECT_EQ(512u, seg->data_size());
+  EXPECT_EQ(seg->capacity() - 512, seg->padding_size());
+
+  EXPECT_EQ(0, storage->close());
+}
+
+// Test: Resize shrinks data_size
+TEST_F(BufferStorageWriteTest, ResizeShrink) {
+  auto storage = OpenWritable();
+  ASSERT_TRUE(storage);
+
+  ASSERT_EQ(0, storage->append("seg1", 4096));
+  auto seg = storage->get("seg1");
+  ASSERT_TRUE(seg);
+
+  // Write to grow data_size to 100; the shrink below is meaningless otherwise
+  std::vector<char> buf(100, 'X');
+  ASSERT_EQ(buf.size(), seg->write(0, buf.data(), buf.size()));
+  EXPECT_EQ(100u, seg->data_size());
+
+  // Resize to smaller
+  size_t new_size = seg->resize(50);
+  EXPECT_EQ(50u, new_size);
+  EXPECT_EQ(50u, seg->data_size());
+
+  EXPECT_EQ(0, storage->close());
+}
+
+// Test: Resize beyond capacity is clamped
+TEST_F(BufferStorageWriteTest, ResizeBeyondCapacityClamped) {
+  auto storage = OpenWritable();
+  ASSERT_TRUE(storage);
+
+  ASSERT_EQ(0, storage->append("seg1", 4096));
+  auto seg = storage->get("seg1");
+  ASSERT_TRUE(seg);
+
+  size_t cap = seg->capacity();
+  size_t result = seg->resize(cap + 1000);
+  EXPECT_EQ(cap, result);
+  EXPECT_EQ(cap, seg->data_size());
+  EXPECT_EQ(0u, seg->padding_size());
+
+  EXPECT_EQ(0, storage->close());
+}
+
+// ===== CRC Tests =====
+
+// Test: update_data_crc reflects in data_crc() getter
+TEST_F(BufferStorageWriteTest, UpdateDataCrc) {
+  auto storage = OpenWritable();
+  ASSERT_TRUE(storage);
+
+  ASSERT_EQ(0, storage->append("seg1", 4096));
+  auto seg = storage->get("seg1");
+  ASSERT_TRUE(seg);
+
+  uint32_t new_crc = 0xDEADBEEF;
+  seg->update_data_crc(new_crc);
+  EXPECT_EQ(new_crc, seg->data_crc());
+
+  EXPECT_EQ(0, storage->close());
+}
+
+// Test: CRC persists after flush and reopen
+TEST_F(BufferStorageWriteTest, UpdateDataCrcPersistence) {
+  uint32_t crc_val = 0x12345678;
+  {
+    auto storage = OpenWritable();
+    ASSERT_TRUE(storage);
+    ASSERT_EQ(0, storage->append("seg1", 4096));
+    auto seg = storage->get("seg1");
+    ASSERT_TRUE(seg);
+    std::string data = "crc_test_data";
+    ASSERT_EQ(data.size(), seg->write(0, data.data(), data.size()));
+    seg->update_data_crc(crc_val);
+    EXPECT_EQ(0, storage->flush());
+    EXPECT_EQ(0, storage->close());
+  }
+
+  {
+    auto storage = OpenReadOnly();
+    ASSERT_TRUE(storage);
+    auto seg = storage->get("seg1");
+    ASSERT_TRUE(seg);
+    EXPECT_EQ(crc_val, seg->data_crc());
+    EXPECT_EQ(0, storage->close());
+  }
+}
+
+// ===== Concurrency Tests =====
+
+// Test: Multiple threads writing to different segments concurrently
+TEST_F(BufferStorageWriteTest, ConcurrentWriteDifferentSegments) {
+  auto storage = OpenWritable();
+  ASSERT_TRUE(storage);
+
+  const int kNumSegments = 8;
+  for (int i = 0; i < kNumSegments; ++i) {
+    ASSERT_EQ(0, storage->append("seg_" + std::to_string(i), 16384));
+  }
+
+  std::vector<std::thread> threads;
+  std::atomic<int> errors{0};
+
+  for (int i = 0; i < kNumSegments; ++i) {
+    threads.emplace_back([&, i]() {
+      auto seg = storage->get("seg_" + std::to_string(i));
+      if (!seg) {
+        errors.fetch_add(1);
+        return;
+      }
+      // Each thread writes its own pattern to its own segment
+      std::vector<char> data(1024, static_cast<char>('A' + i));
+      for (int j = 0; j < 10; ++j) {
+        size_t offset = j * 1024;
+        if (seg->write(offset, data.data(), data.size()) != data.size()) {
+          errors.fetch_add(1);
+        }
+      }
+    });
+  }
+
+  for (auto &t : threads) t.join();
+  EXPECT_EQ(0, errors.load());
+
+  // Verify the full 10 KiB each thread wrote, not just its first byte
+  for (int i = 0; i < kNumSegments; ++i) {
+    auto seg = storage->get("seg_" + std::to_string(i));
+    ASSERT_TRUE(seg);
+    // Last write was at offset 9*1024, so data_size >= 10*1024
+    EXPECT_GE(seg->data_size(), 10u * 1024u);
+
+    std::vector<char> buf(10u * 1024u);
+    ASSERT_EQ(buf.size(), seg->fetch(0, buf.data(), buf.size()));
+    EXPECT_EQ(buf, std::vector<char>(buf.size(), static_cast<char>('A' + i)));
+  }
+
+  EXPECT_EQ(0, storage->close());
+}
+
+// Test: Multiple threads writing to the same segment at different offsets
+TEST_F(BufferStorageWriteTest, ConcurrentWriteSameSegment) {
+  auto storage = OpenWritable();
+  ASSERT_TRUE(storage);
+
+  // Need large enough segment for all threads
+  ASSERT_EQ(0, storage->append("shared_seg", 65536));
+  auto seg = storage->get("shared_seg");
+  ASSERT_TRUE(seg);
+
+  const int kNumThreads = 8;
+  const size_t kChunkSize = 256;
+  std::atomic<int> errors{0};
+  std::vector<std::thread> threads;
+
+  for (int i = 0; i < kNumThreads; ++i) {
+    threads.emplace_back([&, i]() {
+      // Each thread writes to its own non-overlapping region
+      size_t offset = i * kChunkSize * 10;
+      std::vector<char> data(kChunkSize, static_cast<char>('A' + i));
+      for (int j = 0; j < 10; ++j) {
+        if (seg->write(offset + j * kChunkSize, data.data(), data.size()) !=
+            data.size()) {
+          errors.fetch_add(1);
+        }
+      }
+    });
+  }
+
+  for (auto &t : threads) t.join();
+  EXPECT_EQ(0, errors.load());
+
+  // Verify all ten chunks of each thread's region, not only the first one
+  for (int i = 0; i < kNumThreads; ++i) {
+    size_t offset = i * kChunkSize * 10;
+    std::vector<char> buf(kChunkSize * 10);
+    ASSERT_EQ(buf.size(), seg->fetch(offset, buf.data(), buf.size()));
+    for (size_t b = 0; b < buf.size(); ++b) {
+      EXPECT_EQ(buf[b], static_cast<char>('A' + i))
+          << "Mismatch at thread " << i << " byte " << b;
+    }
+  }
+
+  EXPECT_EQ(0, storage->close());
+}
+
+// Test: Concurrent writers + flush (simulates real workload)
+TEST_F(BufferStorageWriteTest, ConcurrentWriteWithFlush) {
+  auto storage = OpenWritable();
+  ASSERT_TRUE(storage);
+
+  ASSERT_EQ(0, storage->append("seg1", 65536));
+  auto seg = storage->get("seg1");
+  ASSERT_TRUE(seg);
+
+  std::atomic<bool> stop{false};
+  std::atomic<int> errors{0};  // short writes plus failed flushes
+
+  // Writer threads
+  std::vector<std::thread> writers;
+  for (int i = 0; i < 4; ++i) {
+    writers.emplace_back([&, i]() {
+      std::vector<char> data(128, static_cast<char>('0' + i));
+      int iter = 0;
+      while (!stop.load(std::memory_order_relaxed) && iter < 100) {
+        size_t offset = (i * 128 + (iter % 10) * 128) % 4096;
+        if (seg->write(offset, data.data(), data.size()) != data.size()) {
+          errors.fetch_add(1);
+        }
+        ++iter;
+      }
+    });
+  }
+
+  // Flush thread: a failed flush is counted instead of being dropped
+  std::thread flusher([&]() {
+    for (int i = 0; i < 5; ++i) {
+      std::this_thread::sleep_for(std::chrono::milliseconds(1));
+      if (storage->flush() != 0) errors.fetch_add(1);
+    }
+    stop.store(true);
+  });
+
+  for (auto &w : writers) w.join();
+  flusher.join();
+
+  EXPECT_EQ(0, errors.load());
+  EXPECT_EQ(0, storage->close());
+}
+
+// ===== Append + Write Integration =====
+
+// Test: Append multiple segments then write to each
+TEST_F(BufferStorageWriteTest, AppendThenWriteSequence) {
+  auto storage = OpenWritable();
+  ASSERT_TRUE(storage);
+
+  for (int i = 0; i < 5; ++i) {
+    std::string seg_name = "seg_" + std::to_string(i);
+    ASSERT_EQ(0, storage->append(seg_name, 4096));
+    auto seg = storage->get(seg_name);
+    ASSERT_TRUE(seg);
+
+    std::string data = "content_of_segment_" + std::to_string(i);
+    EXPECT_EQ(data.size(), seg->write(0, data.data(), data.size()));
+  }
+
+  // Verify all segments have correct data
+  for (int i = 0; i < 5; ++i) {
+    std::string seg_name = "seg_" + std::to_string(i);
+    auto seg = storage->get(seg_name);
+    ASSERT_TRUE(seg);
+    std::string expected = "content_of_segment_" + std::to_string(i);
+    std::vector<char> buf(expected.size());
+    EXPECT_EQ(expected.size(), seg->fetch(0, buf.data(), buf.size()));
+    EXPECT_EQ(expected, std::string(buf.data(), buf.size()));
+  }
+
+  EXPECT_EQ(0, storage->close());
+}
+
+// Test: Write to a segment, append another, write to both, verify all
+TEST_F(BufferStorageWriteTest, InterleavedAppendAndWrite) {
+  auto storage = OpenWritable();
+  ASSERT_TRUE(storage);
+
+  // Append and write first segment
+  ASSERT_EQ(0, storage->append("seg1", 4096));
+  auto seg1 = storage->get("seg1");
+  ASSERT_TRUE(seg1);
+  std::string d1 = "first_data";
+  EXPECT_EQ(d1.size(), seg1->write(0, d1.data(), d1.size()));
+
+  // Append second segment (triggers flush_index internally)
+  ASSERT_EQ(0, storage->append("seg2", 4096));
+  auto seg2 = storage->get("seg2");
+  ASSERT_TRUE(seg2);
+  std::string d2 = "second_data";
+  EXPECT_EQ(d2.size(), seg2->write(0, d2.data(), d2.size()));
+
+  // Re-get seg1 (pointer stability) and write more
+  auto seg1_again = storage->get("seg1");
+  ASSERT_TRUE(seg1_again);
+  std::string d1_extra = "extra";
+  EXPECT_EQ(d1_extra.size(),
+            seg1_again->write(d1.size(), d1_extra.data(), d1_extra.size()));
+
+  // Verify all data
+  std::vector<char> buf(d1.size() + d1_extra.size());
+  EXPECT_EQ(buf.size(), seg1_again->fetch(0, buf.data(), buf.size()));
+  EXPECT_EQ(d1 + d1_extra, std::string(buf.data(), buf.size()));
+
+  std::vector<char> buf2(d2.size());
+  EXPECT_EQ(d2.size(), seg2->fetch(0, buf2.data(), buf2.size()));
+  EXPECT_EQ(d2, std::string(buf2.data(), buf2.size()));
+
+  EXPECT_EQ(0, storage->close());
+}
+
+// ===== Large Write Tests =====
+
+// Test: Fill entire segment capacity with data
+TEST_F(BufferStorageWriteTest, WriteLargeBuffer) {
+  auto storage = OpenWritable();
+  ASSERT_TRUE(storage);
+
+  // Request 16KB segment (will be page-aligned)
+  ASSERT_EQ(0, storage->append("big_seg", 16384));
+  auto seg = storage->get("big_seg");
+  ASSERT_TRUE(seg);
+
+  size_t cap = seg->capacity();
+  ASSERT_GE(cap, 16384u);
+
+  // Fill with a pattern
+  std::vector<char> data(cap);
+  std::iota(data.begin(), data.end(), static_cast<char>(0));
+  EXPECT_EQ(cap, seg->write(0, data.data(), data.size()));
+  EXPECT_EQ(cap, seg->data_size());
+  EXPECT_EQ(0u, seg->padding_size());
+
+  // Verify a portion
+  std::vector<char> verify(1024);
+  EXPECT_EQ(1024u, seg->fetch(0, verify.data(), 1024));
+  EXPECT_EQ(0, std::memcmp(data.data(), verify.data(), 1024));
+
+  EXPECT_EQ(0, storage->close());
+}
+
+// Test: Large write persistence across close/reopen
+TEST_F(BufferStorageWriteTest, WriteLargeBufferPersistence) {
+  const size_t kSize = 8192;
+  std::vector<char> data(kSize);
+  for (size_t i = 0; i < kSize; ++i) {
+    data[i] = static_cast<char>(i % 256);
+  }
+
+  {
+    auto storage = OpenWritable();
+    ASSERT_TRUE(storage);
+    ASSERT_EQ(0, storage->append("large_seg", kSize));
+    auto seg = storage->get("large_seg");
+    ASSERT_TRUE(seg);
+    EXPECT_EQ(kSize, seg->write(0, data.data(), data.size()));
+    EXPECT_EQ(0, storage->close());
+  }
+
+  {
+    auto storage = OpenReadOnly();
+    ASSERT_TRUE(storage);
+    auto seg = storage->get("large_seg");
+    ASSERT_TRUE(seg);
+    EXPECT_EQ(kSize, seg->data_size());
+
+    std::vector<char> buf(kSize);
+    EXPECT_EQ(kSize, seg->fetch(0, buf.data(), kSize));
+    EXPECT_EQ(0, std::memcmp(data.data(), buf.data(), kSize));
+    EXPECT_EQ(0, storage->close());
+  }
+}
+
+// ===== Refresh / Checkpoint Tests =====
+
+// Test: refresh() updates checkpoint and marks dirty
+TEST_F(BufferStorageWriteTest, RefreshCheckpoint) {
+  auto storage = OpenWritable();
+  ASSERT_TRUE(storage);
+  ASSERT_EQ(0, storage->append("seg1", 4096));
+
+  storage->refresh(42);
+  EXPECT_EQ(0, storage->flush());
+
+  // After flush the check_point should be >= 42
+  EXPECT_GE(storage->check_point(), 42u);
+
+  // Increasing checkpoint
+  storage->refresh(100);
+  EXPECT_EQ(0, storage->flush());
+  EXPECT_GE(storage->check_point(), 100u);
+
+  EXPECT_EQ(0, storage->close());
+}
+
+// ===== Duplicate / Error Handling =====
+
+// Test: Appending a duplicate segment ID returns error
+TEST_F(BufferStorageWriteTest, AppendDuplicateSegment) {
+  auto storage = OpenWritable();
+  ASSERT_TRUE(storage);
+
+  ASSERT_EQ(0, storage->append("dup_seg", 4096));
+  // Second append with same ID should fail
+  EXPECT_NE(0, storage->append("dup_seg", 4096));
+
+  EXPECT_EQ(0, storage->close());
+}
+
+// Test: Appending a zero-size segment returns error
+TEST_F(BufferStorageWriteTest, AppendZeroSize) {
+  auto storage = OpenWritable();
+  ASSERT_TRUE(storage);
+
+  EXPECT_NE(0, storage->append("zero_seg", 0));
+
+  EXPECT_EQ(0, storage->close());
+}
+
+// ===== Code Review Issue Tests =====
+// The following tests target specific bugs/races found during code review.
+
+// PR#414 Issue: data_size concurrent race on same segment.
+// Multiple threads calling write() with different offsets should not corrupt
+// the (data_size, padding_size) pair. Their sum must equal capacity when
+// observed after all writers quiesce (individual unsynchronized reads during
+// concurrent writes may appear torn, which is expected).
+TEST_F(BufferStorageWriteTest, CR_DataSizePaddingSizeInvariant) {
+  auto storage = OpenWritable();
+  ASSERT_TRUE(storage);
+
+  ASSERT_EQ(0, storage->append("seg1", 8192));
+  auto seg = storage->get("seg1");
+  ASSERT_TRUE(seg);
+  const size_t cap = seg->capacity();
+
+  const int kNumThreads = 8;
+  const int kIters = 200;
+  std::atomic<int> write_failures{0};
+  std::vector<std::thread> threads;
+
+  for (int i = 0; i < kNumThreads; ++i) {
+    threads.emplace_back([&, i]() {
+      char buf[64];
+      std::memset(buf, 'A' + i, sizeof(buf));
+      for (int j = 0; j < kIters; ++j) {
+        // Write at various offsets within capacity to exercise data_size growth
+        size_t offset = ((i * 64) + j * 7) % (cap - 64);
+        if (seg->write(offset, buf, sizeof(buf)) != sizeof(buf)) {
+          write_failures.fetch_add(1);
+        }
+      }
+    });
+  }
+
+  for (auto &t : threads) t.join();
+  EXPECT_EQ(0, write_failures.load());
+  // After all writers stop, the invariant MUST hold
+  EXPECT_EQ(cap, seg->data_size() + seg->padding_size());
+  EXPECT_GT(seg->data_size(), 0u);
+  EXPECT_EQ(0, storage->close());
+}
+
+// PR#414 Issue: Concurrent write() + resize() on same segment.
+// meta_mtx_ must serialize so that (data_size, padding_size) stays consistent.
+// The invariant is verified after all threads stop (reads without meta_mtx_
+// during concurrent mutation may observe a torn pair, which is expected).
+TEST_F(BufferStorageWriteTest, CR_ConcurrentWriteAndResize) {
+  auto storage = OpenWritable();
+  ASSERT_TRUE(storage);
+
+  ASSERT_EQ(0, storage->append("seg1", 8192));
+  auto seg = storage->get("seg1");
+  ASSERT_TRUE(seg);
+  const size_t cap = seg->capacity();
+
+  std::atomic<bool> stop{false};
+  std::atomic<int> write_failures{0};
+
+  // Writer thread: grows data_size by writing at increasing offsets
+  std::thread writer([&]() {
+    char buf[128];
+    std::memset(buf, 'W', sizeof(buf));
+    for (int j = 0; j < 300 && !stop.load(std::memory_order_relaxed); ++j) {
+      size_t offset = j % (cap - 128);
+      if (seg->write(offset, buf, sizeof(buf)) != sizeof(buf)) {
+        write_failures.fetch_add(1);
+      }
+    }
+  });
+
+  // Resizer thread: constantly resizes
+  std::thread resizer([&]() {
+    for (int j = 0; j < 300 && !stop.load(std::memory_order_relaxed); ++j) {
+      size_t new_size = (j * 37) % cap;
+      seg->resize(new_size);
+    }
+    stop.store(true);
+  });
+
+  writer.join();
+  resizer.join();
+
+  EXPECT_EQ(0, write_failures.load());
+  // After quiescence, invariant must hold
+  EXPECT_EQ(cap, seg->data_size() + seg->padding_size());
+  EXPECT_EQ(0, storage->close());
+}
+
+// Chain-split bug: Many appends exhaust segment_meta capacity, triggering
+// chain split. After reopen, ALL segments must be findable.
+// (Tests fix for reserve()-induced dangling pointer in append_segment.)
+TEST_F(BufferStorageWriteTest, CR_ChainSplitAllSegmentsAccessible) {
+  const int kNumSegments = 50;  // Enough to trigger chain split with default 4096 meta capacity
+
+  {
+    auto storage = OpenWritable();
+    ASSERT_TRUE(storage);
+
+    for (int i = 0; i < kNumSegments; ++i) {
+      std::string name = "chain_seg_" + std::to_string(i);
+      ASSERT_EQ(0, storage->append(name, 4096))
+          << "Failed to append segment " << i;
+      auto seg = storage->get(name);
+      ASSERT_TRUE(seg) << "Failed to get segment " << name << " right after append";
+      // Write a marker so we can verify on reopen
+      std::string marker = "marker_" + std::to_string(i);
+      EXPECT_EQ(marker.size(), seg->write(0, marker.data(), marker.size()));
+    }
+    EXPECT_EQ(0, storage->flush());
+    EXPECT_EQ(0, storage->close());
+  }
+
+  // Reopen and verify ALL segments are present and readable
+  {
+    auto storage = OpenReadOnly();
+    ASSERT_TRUE(storage);
+    for (int i = 0; i < kNumSegments; ++i) {
+      std::string name = "chain_seg_" + std::to_string(i);
+      auto seg = storage->get(name);
+      ASSERT_TRUE(seg) << "Segment " << name << " missing after reopen (chain-split bug?)";
+      std::string expected = "marker_" + std::to_string(i);
+      std::vector<char> buf(expected.size());
+      EXPECT_EQ(expected.size(), seg->fetch(0, buf.data(), buf.size()));
+      EXPECT_EQ(expected, std::string(buf.data(), buf.size()))
+          << "Data mismatch for " << name;
+    }
+    EXPECT_EQ(0, storage->close());
+  }
+}
+
+// mapping_shard_id bug: Multiple BufferStorage instances opened on the
+// same thread must work correctly (the old thread_local shard_id would
+// map them to the same shard, causing potential conflicts).
+TEST_F(BufferStorageWriteTest, CR_MultipleInstancesSameThread) {
+  std::string path2 = file_path_ + "_second";
+  ailego::File::Delete(path2);
+
+  auto storage1 = OpenWritable();
+  ASSERT_TRUE(storage1);
+
+  // Open a second independent BufferStorage instance
+  auto storage2 = IndexFactory::CreateStorage("BufferStorage");
+  ASSERT_TRUE(storage2);
+  ailego::Params params;
+  ASSERT_EQ(0, storage2->init(params));
+  ASSERT_EQ(0, storage2->open(path2, true));
+
+  // Interleave appends and writes to both instances from the SAME thread
+  ASSERT_EQ(0, storage1->append("seg_a", 4096));
+  ASSERT_EQ(0, storage2->append("seg_b", 4096));
+
+  auto seg_a = storage1->get("seg_a");
+  auto seg_b = storage2->get("seg_b");
+  ASSERT_TRUE(seg_a);
+  ASSERT_TRUE(seg_b);
+
+  std::string da = "instance_one_data";
+  std::string db = "instance_two_data";
+  EXPECT_EQ(da.size(), seg_a->write(0, da.data(), da.size()));
+  EXPECT_EQ(db.size(), seg_b->write(0, db.data(), db.size()));
+
+  // Verify data isolation
+  std::vector<char> buf1(da.size());
+  EXPECT_EQ(da.size(), seg_a->fetch(0, buf1.data(), buf1.size()));
+  EXPECT_EQ(da, std::string(buf1.data(), buf1.size()));
+
+  std::vector<char> buf2(db.size());
+  EXPECT_EQ(db.size(), seg_b->fetch(0, buf2.data(), buf2.size()));
+  EXPECT_EQ(db, std::string(buf2.data(), buf2.size()));
+
+  EXPECT_EQ(0, storage1->close());
+  EXPECT_EQ(0, storage2->close());
+  ailego::File::Delete(path2);
+}
+
+// Cross-page read/write: Write data spanning page boundaries (4KB pages),
+// then read back via both fetch() and read(MemoryBlock&) to verify the
+// cross-page buffer allocation path. (Tests fix for UAF in cross-page read.)
+TEST_F(BufferStorageWriteTest, CR_CrossPageWriteAndRead) {
+  auto storage = OpenWritable();
+  ASSERT_TRUE(storage);
+
+  // Segment large enough to span multiple pages
+  ASSERT_EQ(0, storage->append("cross_page_seg", 16384));
+  auto seg = storage->get("cross_page_seg");
+  ASSERT_TRUE(seg);
+
+  // Write 5000 bytes starting at offset 2000, which crosses the first
+  // page boundary at 4096 (relative to segment data start in the file).
+  const size_t kWriteOffset = 2000;
+  const size_t kWriteLen = 5000;
+  std::vector<char> write_data(kWriteLen);
+  for (size_t i = 0; i < kWriteLen; ++i) {
+    write_data[i] = static_cast<char>((i * 7 + 13) % 256);
+  }
+  EXPECT_EQ(kWriteLen, seg->write(kWriteOffset, write_data.data(), kWriteLen));
+
+  // Read back via fetch (uses read_range internally for cross-page)
+  std::vector<char> fetch_buf(kWriteLen);
+  EXPECT_EQ(kWriteLen, seg->fetch(kWriteOffset, fetch_buf.data(), kWriteLen));
+  EXPECT_EQ(write_data, fetch_buf);
+
+  // Read back via read(MemoryBlock&) - exercises the cross-page alloc path.
+  // The read is ASSERTed so a failed read never reaches memcmp() on an
+  // unfilled block.  The MemoryBlock is scoped so it is destroyed BEFORE
+  // storage->close(): when the read lands on a single page (e.g. macOS
+  // arm64 with 16KB pages, where [2000, 7000) fits in one page) the block
+  // is MBT_BUFFERPOOL and holds a raw pointer to buffer_pool_handle_, which
+  // close_index() resets -- ~MemoryBlock()'s release_one() would segfault.
+  {
+    IndexStorage::MemoryBlock mb;
+    ASSERT_EQ(kWriteLen, seg->read(kWriteOffset, mb, kWriteLen));
+    EXPECT_EQ(0, std::memcmp(write_data.data(), mb.data(), kWriteLen));
+  }
+
+  EXPECT_EQ(0, storage->close());
+}
+
+// Dirty flag race: a write() issued after flush_index() must set the dirty
+// flag again so that a later flush still persists the new bytes.
+// Checked end to end via close→reopen→read.
+TEST_F(BufferStorageWriteTest, CR_DirtyFlagNotLostAfterFlush) {
+  const std::string first_write = "early";
+  const std::string second_write = "late_write_after_flush";
+
+  {
+    auto writable = OpenWritable();
+    ASSERT_TRUE(writable);
+    ASSERT_EQ(0, writable->append("seg1", 4096));
+    auto handle = writable->get("seg1");
+    ASSERT_TRUE(handle);
+
+    // First write, followed by an explicit flush
+    EXPECT_EQ(first_write.size(),
+              handle->write(0, first_write.data(), first_write.size()));
+    EXPECT_EQ(0, writable->flush());
+
+    // Second write happens AFTER the flush - the dirty flag must be set again
+    EXPECT_EQ(second_write.size(),
+              handle->write(100, second_write.data(), second_write.size()));
+    // No explicit flush before closing (close_index will flush)
+    EXPECT_EQ(0, writable->close());
+  }
+
+  // Reopen read-only and check that the post-flush write was persisted
+  {
+    auto reader = OpenReadOnly();
+    ASSERT_TRUE(reader);
+    auto handle = reader->get("seg1");
+    ASSERT_TRUE(handle);
+
+    std::vector<char> out(second_write.size());
+    EXPECT_EQ(second_write.size(), handle->fetch(100, out.data(), out.size()));
+    EXPECT_EQ(second_write, std::string(out.data(), out.size()));
+    EXPECT_EQ(0, reader->close());
+  }
+}
+
+// Stress test: Concurrent flush + write interleaving to expose dirty flag races.
+// All writes that return successfully MUST be visible after final close+reopen.
+TEST_F(BufferStorageWriteTest, CR_ConcurrentFlushWriteDirtyFlagStress) {
+  auto storage = OpenWritable();
+  ASSERT_TRUE(storage);
+
+  ASSERT_EQ(0, storage->append("seg1", 65536));
+  auto seg = storage->get("seg1");
+  ASSERT_TRUE(seg);
+
+  // End offset of the last acknowledged write (one writer, offsets only grow)
+  std::atomic<size_t> max_committed_end{0};
+  std::atomic<bool> stop{false};
+
+  // Writer: contiguous 64-byte 'P' records; stops at the first failed write
+  std::thread writer([&]() {
+    char rec[64];
+    std::memset(rec, 'P', sizeof(rec));
+    for (int i = 0; i < 500 && !stop.load(std::memory_order_relaxed); ++i) {
+      const size_t offset = static_cast<size_t>(i) * sizeof(rec);
+      if (seg->write(offset, rec, sizeof(rec)) != sizeof(rec)) break;
+      max_committed_end.store(offset + sizeof(rec));
+    }
+  });
+
+  // Flusher: repeatedly flushes to trigger the CAS(dirty, false) path
+  std::thread flusher([&]() {
+    for (int i = 0; i < 50; ++i) {
+      storage->flush();
+      std::this_thread::sleep_for(std::chrono::microseconds(100));
+    }
+    stop.store(true);
+  });
+
+  writer.join();
+  flusher.join();
+
+  const size_t committed = max_committed_end.load();
+  EXPECT_GE(seg->data_size(), committed);
+  EXPECT_EQ(0, storage->close());
+
+  // Reopen read-only: every acknowledged byte must have reached the file.
+  auto reopened = OpenReadOnly();
+  ASSERT_TRUE(reopened);
+  auto persisted = reopened->get("seg1");
+  ASSERT_TRUE(persisted);
+  std::vector<char> buf(committed);
+  EXPECT_EQ(committed, persisted->fetch(0, buf.data(), buf.size()));
+  EXPECT_EQ(std::string(committed, 'P'), std::string(buf.data(), buf.size()));
+  EXPECT_EQ(0, reopened->close());
+}
+
+// Handle stability across append: a segment handle obtained BEFORE further
+// append() calls must keep working AFTER them (unordered_map address
+// stability guarantee). Regression test for reserve()-based invalidation.
+TEST_F(BufferStorageWriteTest, CR_PointerStabilityAcrossAppend) {
+  auto storage = OpenWritable();
+  ASSERT_TRUE(storage);
+
+  ASSERT_EQ(0, storage->append("seg_first", 4096));
+  auto first_seg = storage->get("seg_first");
+  ASSERT_TRUE(first_seg);
+
+  // Seed the segment through the handle before any further appends
+  const std::string head = "before_append";
+  EXPECT_EQ(head.size(), first_seg->write(0, head.data(), head.size()));
+
+  // Grow the storage by 20 more segments (may trigger internal rehash/resize)
+  for (int n = 0; n != 20; ++n) {
+    ASSERT_EQ(0, storage->append("new_seg_" + std::to_string(n), 4096));
+  }
+
+  // The handle taken earlier must still be valid and accept writes
+  const std::string tail = "_after_appends";
+  EXPECT_EQ(tail.size(),
+            first_seg->write(head.size(), tail.data(), tail.size()));
+
+  // Read both pieces back through the same handle
+  const std::string expected = head + tail;
+  std::string actual(expected.size(), '\0');
+  EXPECT_EQ(expected.size(), first_seg->fetch(0, &actual[0], actual.size()));
+  EXPECT_EQ(expected, actual);
+
+  EXPECT_EQ(0, storage->close());
+}
+
+// update_data_crc concurrent with write: CRC update must be serialized
+// with data_size changes via meta_mtx_. Invariant verified post-quiescence.
+TEST_F(BufferStorageWriteTest, CR_ConcurrentWriteAndCrcUpdate) {
+  auto storage = OpenWritable();
+  ASSERT_TRUE(storage);
+
+  ASSERT_EQ(0, storage->append("seg1", 8192));
+  auto seg = storage->get("seg1");
+  ASSERT_TRUE(seg);
+  const size_t cap = seg->capacity();
+  ASSERT_GT(cap, 128u);  // (cap - 128) below is used as an unsigned modulus
+
+  std::atomic<bool> stop{false};
+  std::atomic<int> write_failures{0};
+
+  // Writer thread: 128-byte writes at offsets wrapped into [0, cap - 128)
+  std::thread writer([&]() {
+    char buf[128];
+    std::memset(buf, 'X', sizeof(buf));
+    for (int i = 0; i < 500 && !stop.load(std::memory_order_relaxed); ++i) {
+      const size_t offset = (static_cast<size_t>(i) * 128) % (cap - 128);
+      if (seg->write(offset, buf, sizeof(buf)) != sizeof(buf)) {
+        write_failures.fetch_add(1);
+      }
+    }
+  });
+
+  // CRC updater thread: concurrently updates CRC, then tells the writer to stop
+  std::thread crc_updater([&]() {
+    for (int i = 0; i < 500 && !stop.load(std::memory_order_relaxed); ++i) {
+      seg->update_data_crc(static_cast<uint32_t>(i));
+    }
+    stop.store(true);
+  });
+
+  writer.join();
+  crc_updater.join();
+
+  EXPECT_EQ(0, write_failures.load());
+  // After all threads stop, invariant must hold
+  EXPECT_EQ(cap, seg->data_size() + seg->padding_size());
+  // CRC is last-writer-wins; only check that reading it does not crash
+  (void)seg->data_crc();
+  EXPECT_EQ(0, storage->close());
+}
diff --git a/tests/db/collection_test.cc b/tests/db/collection_test.cc
index 2fcf3de18..1ffdca863 100644
--- a/tests/db/collection_test.cc
+++ b/tests/db/collection_test.cc
@@ -47,6 +47,8 @@ std::string col_path = "test_collection";
 class CollectionTest : public ::testing::Test {
  protected:
   void SetUp() override {
+    zvec::ailego::MemoryLimitPool::get_instance().init(2 * 1024ll * 1024ll *
+                                                       1024ll);
     FileHelper::RemoveDirectory(col_path);
   }
 
@@ -57,128 +59,132 @@ class CollectionTest : public ::testing::Test {
 };
 
 TEST_F(CollectionTest, Feature_CreateAndOpen_General) {
-  CollectionOptions options;
-  options.read_only_ = false;
-  options.enable_mmap_ = true;
+  auto func = [&](bool enable_mmap) {
+    CollectionOptions options;
+    options.read_only_ = false;
+    options.enable_mmap_ = enable_mmap;
 
-  std::string path = "./demo";
+    std::string path = "./demo";
 
-  ailego::FileHelper::RemoveDirectory(path.c_str());
+    ailego::FileHelper::RemoveDirectory(path.c_str());
 
-  auto schema = TestHelper::CreateNormalSchema();
-  auto result = Collection::CreateAndOpen(path, *schema, options);
-  if (!result.has_value()) {
-    std::cout << result.error().message() << std::endl;
-  }
-  ASSERT_TRUE(result.has_value());
-  ASSERT_TRUE(ailego::FileHelper::IsExist(path.c_str()));
-
-  auto col = result.value();
-  ASSERT_EQ(col->Path(), path);
-  ASSERT_EQ(col->Schema(), *schema);
-  ASSERT_EQ(col->Options(), options);
-  auto stats = col->Stats().value();
-  ASSERT_TRUE(stats.doc_count == 0);
-  ASSERT_EQ(stats.index_completeness["dense_fp32"], 1);
-  ASSERT_EQ(stats.index_completeness["dense_fp16"], 1);
-  // ASSERT_EQ(stats.index_completeness["dense_fp64"], 1);
-  ASSERT_EQ(stats.index_completeness["sparse_fp32"], 1);
-  ASSERT_EQ(stats.index_completeness["sparse_fp16"], 1);
-
-  ASSERT_EQ(col->Destroy(), Status::OK());
-
-  // after destroyed, every interface should return error
-  std::vector<Doc> empty_docs;
-  ASSERT_FALSE(col->Insert(empty_docs).has_value());
-  ASSERT_FALSE(col->Update(empty_docs).has_value());
-  ASSERT_FALSE(col->Delete({}).has_value());
-  ASSERT_FALSE(col->DeleteByFilter("").ok());
-  ASSERT_FALSE(col->Fetch({}).has_value());
-  ASSERT_FALSE(col->Query(SearchQuery{}).has_value());
-  ASSERT_FALSE(col->Query(MultiQuery{}).has_value());
-  ASSERT_FALSE(col->GroupByQuery({}).has_value());
-  ASSERT_FALSE(col->CreateIndex("", nullptr).ok());
-  ASSERT_FALSE(col->DropIndex("").ok());
-  ASSERT_FALSE(col->AddColumn(nullptr, "").ok());
-  ASSERT_FALSE(col->AlterColumn("", "", nullptr).ok());
-  ASSERT_FALSE(col->DropColumn("").ok());
-  ASSERT_FALSE(col->CreateIndex("", nullptr).ok());
-  ASSERT_FALSE(col->Optimize().ok());
-  ASSERT_FALSE(col->Flush().ok());
-  ASSERT_FALSE(col->Destroy().ok());
-  ASSERT_FALSE(col->Options().has_value());
-  ASSERT_FALSE(col->Path().has_value());
-  ASSERT_FALSE(col->Stats().has_value());
-  ASSERT_FALSE(col->Schema().has_value());
-
-  ASSERT_FALSE(ailego::FileHelper::IsExist(path.c_str()));
-
-  // recreate
-  result = Collection::CreateAndOpen(path, *schema, options);
-  ASSERT_TRUE(result.has_value());
-  ASSERT_TRUE(ailego::FileHelper::IsExist(path.c_str()));
+    auto schema = TestHelper::CreateNormalSchema();
+    auto result = Collection::CreateAndOpen(path, *schema, options);
+    if (!result.has_value()) {
+      std::cout << result.error().message() << std::endl;
+    }
+    ASSERT_TRUE(result.has_value());
+    ASSERT_TRUE(ailego::FileHelper::IsExist(path.c_str()));
+
+    auto col = result.value();
+    ASSERT_EQ(col->Path(), path);
+    ASSERT_EQ(col->Schema(), *schema);
+    ASSERT_EQ(col->Options(), options);
+    auto stats = col->Stats().value();
+    ASSERT_TRUE(stats.doc_count == 0);
+    ASSERT_EQ(stats.index_completeness["dense_fp32"], 1);
+    ASSERT_EQ(stats.index_completeness["dense_fp16"], 1);
+    // ASSERT_EQ(stats.index_completeness["dense_fp64"], 1);
+    ASSERT_EQ(stats.index_completeness["sparse_fp32"], 1);
+    ASSERT_EQ(stats.index_completeness["sparse_fp16"], 1);
 
-  col = std::move(result.value());
-  col.reset();
-  col = nullptr;
+    ASSERT_EQ(col->Destroy(), Status::OK());
+
+    // after destroyed, every interface should return error
+    std::vector<Doc> empty_docs;
+    ASSERT_FALSE(col->Insert(empty_docs).has_value());
+    ASSERT_FALSE(col->Update(empty_docs).has_value());
+    ASSERT_FALSE(col->Delete({}).has_value());
+    ASSERT_FALSE(col->DeleteByFilter("").ok());
+    ASSERT_FALSE(col->Fetch({}).has_value());
+    ASSERT_FALSE(col->Query(SearchQuery{}).has_value());
+    ASSERT_FALSE(col->Query(MultiQuery{}).has_value());
+    ASSERT_FALSE(col->GroupByQuery({}).has_value());
+    ASSERT_FALSE(col->CreateIndex("", nullptr).ok());
+    ASSERT_FALSE(col->DropIndex("").ok());
+    ASSERT_FALSE(col->AddColumn(nullptr, "").ok());
+    ASSERT_FALSE(col->AlterColumn("", "", nullptr).ok());
+    ASSERT_FALSE(col->DropColumn("").ok());
+    ASSERT_FALSE(col->CreateIndex("", nullptr).ok());
+    ASSERT_FALSE(col->Optimize().ok());
+    ASSERT_FALSE(col->Flush().ok());
+    ASSERT_FALSE(col->Destroy().ok());
+    ASSERT_FALSE(col->Options().has_value());
+    ASSERT_FALSE(col->Path().has_value());
+    ASSERT_FALSE(col->Stats().has_value());
+    ASSERT_FALSE(col->Schema().has_value());
+
+    ASSERT_FALSE(ailego::FileHelper::IsExist(path.c_str()));
+
+    // recreate
+    result = Collection::CreateAndOpen(path, *schema, options);
+    ASSERT_TRUE(result.has_value());
+    ASSERT_TRUE(ailego::FileHelper::IsExist(path.c_str()));
 
-  ASSERT_TRUE(ailego::FileHelper::IsExist(path.c_str()));
+    col = std::move(result.value());
+    col.reset();
+    col = nullptr;
 
-  // reopen
-  result = Collection::Open(path, options);
-  ASSERT_TRUE(result.has_value());
-  col = std::move(result.value());
-  col.reset();
+    ASSERT_TRUE(ailego::FileHelper::IsExist(path.c_str()));
 
-  // reopen with read-only
-  options.read_only_ = true;
-  result = Collection::Open(path, options);
-  if (!result.has_value()) {
-    std::cout << result.error().message() << std::endl;
-  }
-  ASSERT_TRUE(result.has_value());
-  col = result.value();
+    // reopen
+    result = Collection::Open(path, options);
+    ASSERT_TRUE(result.has_value());
+    col = std::move(result.value());
+    col.reset();
 
-  ASSERT_EQ(col->Path(), path);
-  ASSERT_EQ(col->Schema(), *schema);
-  ASSERT_EQ(col->Options(), options);
-  stats = col->Stats().value();
-  ASSERT_TRUE(stats.doc_count == 0);
-  ASSERT_EQ(stats.index_completeness["dense_fp32"], 1);
-  ASSERT_EQ(stats.index_completeness["dense_fp16"], 1);
-  // ASSERT_EQ(stats.index_completeness["dense_fp64"], 1);
-  ASSERT_EQ(stats.index_completeness["sparse_fp32"], 1);
-  ASSERT_EQ(stats.index_completeness["sparse_fp16"], 1);
-
-  // when open with read-only, write operation should fail
-  ASSERT_FALSE(col->Flush().ok());
-  ASSERT_FALSE(col->Destroy().ok());
-  ASSERT_FALSE(col->Insert(empty_docs).has_value());
-  ASSERT_FALSE(col->Update(empty_docs).has_value());
-  ASSERT_FALSE(col->Delete({}).has_value());
-  ASSERT_FALSE(col->DeleteByFilter("").ok());
-  ASSERT_FALSE(col->CreateIndex("", nullptr).ok());
-  ASSERT_FALSE(col->DropIndex("").ok());
-  ASSERT_FALSE(col->AddColumn(nullptr, "").ok());
-  ASSERT_FALSE(col->AlterColumn("", "", nullptr).ok());
-  ASSERT_FALSE(col->DropColumn("").ok());
-  ASSERT_FALSE(col->CreateIndex("", nullptr).ok());
-  ASSERT_FALSE(col->Optimize().ok());
-
-  // two threads open with read_only
-  result = Collection::Open(path, options);
-  if (!result.has_value()) {
-    std::cout << result.error().message() << std::endl;
-  }
-  ASSERT_TRUE(result.has_value());
-  col = result.value();
+    // reopen with read-only
+    options.read_only_ = true;
+    result = Collection::Open(path, options);
+    if (!result.has_value()) {
+      std::cout << result.error().message() << std::endl;
+    }
+    ASSERT_TRUE(result.has_value());
+    col = result.value();
 
-  auto result1 = Collection::Open(path, options);
-  if (!result1.has_value()) {
-    std::cout << result1.error().message() << std::endl;
-  }
-  ASSERT_TRUE(result1.has_value());
-  auto col1 = result1.value();
+    ASSERT_EQ(col->Path(), path);
+    ASSERT_EQ(col->Schema(), *schema);
+    ASSERT_EQ(col->Options(), options);
+    stats = col->Stats().value();
+    ASSERT_TRUE(stats.doc_count == 0);
+    ASSERT_EQ(stats.index_completeness["dense_fp32"], 1);
+    ASSERT_EQ(stats.index_completeness["dense_fp16"], 1);
+    // ASSERT_EQ(stats.index_completeness["dense_fp64"], 1);
+    ASSERT_EQ(stats.index_completeness["sparse_fp32"], 1);
+    ASSERT_EQ(stats.index_completeness["sparse_fp16"], 1);
+
+    // when open with read-only, write operation should fail
+    ASSERT_FALSE(col->Flush().ok());
+    ASSERT_FALSE(col->Destroy().ok());
+    ASSERT_FALSE(col->Insert(empty_docs).has_value());
+    ASSERT_FALSE(col->Update(empty_docs).has_value());
+    ASSERT_FALSE(col->Delete({}).has_value());
+    ASSERT_FALSE(col->DeleteByFilter("").ok());
+    ASSERT_FALSE(col->CreateIndex("", nullptr).ok());
+    ASSERT_FALSE(col->DropIndex("").ok());
+    ASSERT_FALSE(col->AddColumn(nullptr, "").ok());
+    ASSERT_FALSE(col->AlterColumn("", "", nullptr).ok());
+    ASSERT_FALSE(col->DropColumn("").ok());
+    ASSERT_FALSE(col->CreateIndex("", nullptr).ok());
+    ASSERT_FALSE(col->Optimize().ok());
+
+    // two threads open with read_only
+    result = Collection::Open(path, options);
+    if (!result.has_value()) {
+      std::cout << result.error().message() << std::endl;
+    }
+    ASSERT_TRUE(result.has_value());
+    col = result.value();
+
+    auto result1 = Collection::Open(path, options);
+    if (!result1.has_value()) {
+      std::cout << result1.error().message() << std::endl;
+    }
+    ASSERT_TRUE(result1.has_value());
+    auto col1 = result1.value();
+  };
+  func(true);
+  func(false);
 }
 
 TEST_F(CollectionTest, Feature_CreateAndOpen_Empty) {
@@ -391,13 +397,13 @@ TEST_F(CollectionTest, Feature_Write_Batch_Validate) {
 }
 
 TEST_F(CollectionTest, Feature_Insert_General) {
-  auto func = [&](bool schema_nullable, bool doc_nullable,
+  auto func = [&](bool enable_mmap, bool schema_nullable, bool doc_nullable,
                   int doc_count = 1000) {
     FileHelper::RemoveDirectory(col_path);
 
     // create with normal schema
     auto schema = TestHelper::CreateNormalSchema(schema_nullable);
-    auto options = CollectionOptions{false, true, 100 * 1024 * 1024};
+    auto options = CollectionOptions{false, enable_mmap, 100 * 1024 * 1024};
     auto collection = TestHelper::CreateCollectionWithDoc(
         col_path, *schema, options, 0, doc_count, doc_nullable);
 
@@ -478,14 +484,16 @@ TEST_F(CollectionTest, Feature_Insert_General) {
     ASSERT_EQ(stats.index_completeness["sparse_fp16"], 1);
   };
 
-  func(false, false);
-  func(true, true);
-  func(true, false);
-  func(false, true);
+  for (bool enable_mmap : {true, false}) {
+    func(enable_mmap, false, false);
+    func(enable_mmap, true, true);
+    func(enable_mmap, true, false);
+    func(enable_mmap, false, true);
 
-  func(false, false, 0);
-  func(false, false, 1);
-  func(false, false, 2);
+    func(enable_mmap, false, false, 0);
+    func(enable_mmap, false, false, 1);
+    func(enable_mmap, false, false, 2);
+  }
 }
 
 TEST_F(CollectionTest, Feature_Insert_ScalarIndex) {
@@ -809,13 +817,13 @@ TEST_F(CollectionTest, Feature_Insert_Duplicate) {
 }
 
 TEST_F(CollectionTest, Feature_Upsert_General) {
-  auto func = [&](bool schema_nullable, bool doc_nullable,
+  auto func = [&](bool enable_mmap, bool schema_nullable, bool doc_nullable,
                   int doc_count = 1000) {
     FileHelper::RemoveDirectory(col_path);
 
     // create with normal schema
     auto schema = TestHelper::CreateNormalSchema(schema_nullable);
-    auto options = CollectionOptions{false, true, 100 * 1024 * 1024};
+    auto options = CollectionOptions{false, enable_mmap, 100 * 1024 * 1024};
     auto collection = TestHelper::CreateCollectionWithDoc(
         col_path, *schema, options, 0, doc_count, doc_nullable, true);
 
@@ -896,14 +904,16 @@ TEST_F(CollectionTest, Feature_Upsert_General) {
     ASSERT_EQ(stats.index_completeness["sparse_fp16"], 1);
   };
 
-  func(false, false);
-  func(true, true);
-  func(true, false);
-  func(false, true);
+  for (bool enable_mmap : {true, false}) {
+    func(enable_mmap, false, false);
+    func(enable_mmap, true, true);
+    func(enable_mmap, true, false);
+    func(enable_mmap, false, true);
 
-  func(false, false, 0);
-  func(false, false, 1);
-  func(false, false, 2);
+    func(enable_mmap, false, false, 0);
+    func(enable_mmap, false, false, 1);
+    func(enable_mmap, false, false, 2);
+  }
 }
 
 TEST_F(CollectionTest, Feature_Upsert_Incremental) {
@@ -1096,9 +1106,9 @@ TEST_F(CollectionTest, Feature_Upsert_Nullable) {
 
 
 TEST_F(CollectionTest, Feature_Update_General) {
-  auto func = [&](int doc_count) {
+  auto func = [&](bool enable_mmap, int doc_count) {
     auto schema = TestHelper::CreateNormalSchema();
-    auto options = CollectionOptions{false, true, 100 * 1024 * 1024};
+    auto options = CollectionOptions{false, enable_mmap, 100 * 1024 * 1024};
     FileHelper::RemoveDirectory(col_path);
 
     // insert first
@@ -1180,10 +1190,12 @@ TEST_F(CollectionTest, Feature_Update_General) {
     check_doc(doc_count);
   };
 
-  func(99);
-  func(100);
-  func(101);
-  func(1000);
+  for (bool enable_mmap : {true, false}) {
+    func(enable_mmap, 99);
+    func(enable_mmap, 100);
+    func(enable_mmap, 101);
+    func(enable_mmap, 1000);
+  }
 }
 
 TEST_F(CollectionTest, Feature_Update_Incremental) {
@@ -1437,9 +1449,9 @@ TEST_F(CollectionTest, Feature_Update_Empty) {
 }
 
 TEST_F(CollectionTest, Feature_Delete_General) {
-  auto func = [&](int doc_count) {
+  auto func = [&](bool enable_mmap, int doc_count) {
     auto schema = TestHelper::CreateNormalSchema();
-    auto options = CollectionOptions{false, true, 100 * 1024 * 1024};
+    auto options = CollectionOptions{false, enable_mmap, 100 * 1024 * 1024};
     FileHelper::RemoveDirectory(col_path);
 
     // insert first
@@ -1515,10 +1527,12 @@ TEST_F(CollectionTest, Feature_Delete_General) {
     check_doc(doc_count);
   };
 
-  func(99);
-  func(100);
-  func(101);
-  func(1000);
+  for (bool enable_mmap : {true, false}) {
+    func(enable_mmap, 99);
+    func(enable_mmap, 100);
+    func(enable_mmap, 101);
+    func(enable_mmap, 1000);
+  }
 }
 
 TEST_F(CollectionTest, Feature_Delete_Repeated) {
@@ -1578,9 +1592,9 @@ TEST_F(CollectionTest, Feature_Delete_Repeated) {
 }
 
 TEST_F(CollectionTest, Feature_DeleteByFilter_General) {
-  auto func = [&](int doc_count) {
+  auto func = [&](bool enable_mmap, int doc_count) {
     auto schema = TestHelper::CreateNormalSchema();
-    auto options = CollectionOptions{false, true, 100 * 1024 * 1024};
+    auto options = CollectionOptions{false, enable_mmap, 100 * 1024 * 1024};
     FileHelper::RemoveDirectory(col_path);
 
     // insert first
@@ -1659,10 +1673,12 @@ TEST_F(CollectionTest, Feature_DeleteByFilter_General) {
     check_doc(doc_count);
   };
 
-  func(99);
-  func(100);
-  func(101);
-  func(1000);
+  for (bool enable_mmap : {true, false}) {
+    func(enable_mmap, 99);
+    func(enable_mmap, 100);
+    func(enable_mmap, 101);
+    func(enable_mmap, 1000);
+  }
 }
 
 TEST_F(CollectionTest, Feature_DeleteByFilter_ScalarIndex) {
@@ -1755,122 +1771,132 @@ TEST_F(CollectionTest, Feature_DeleteByFilter_ScalarIndex) {
 }
 
 TEST_F(CollectionTest, Feature_MixedWrite_General) {
-  // case1: insert -> upsert -> update -> delete
-  auto schema = TestHelper::CreateNormalSchema();
-  auto options = CollectionOptions{false, true, 100 * 1024 * 1024};
-  FileHelper::RemoveDirectory(col_path);
+  auto func = [&](bool enable_mmap) {
+    // case1: insert -> upsert -> update -> delete
+    auto schema = TestHelper::CreateNormalSchema();
+    auto options = CollectionOptions{false, enable_mmap, 100 * 1024 * 1024};
+    FileHelper::RemoveDirectory(col_path);
 
-  // insert first
-  auto collection =
-      TestHelper::CreateCollectionWithDoc(col_path, *schema, options, 0, 0);
+    // insert first
+    auto collection =
+        TestHelper::CreateCollectionWithDoc(col_path, *schema, options, 0, 0);
 
-  for (int i = 0; i < 100; i++) {
-    // std::cout << "insert: " << i << std::endl;
-
-    // insert
-    auto new_doc = TestHelper::CreateDoc(i, *schema);
-    std::vector<Doc> new_docs = {new_doc};
-    auto res = collection->Insert(new_docs);
-    ASSERT_TRUE(res.has_value());
-    ASSERT_TRUE(res.value()[0].ok());
-
-    // fetch
-    auto docs = collection->Fetch({TestHelper::MakePK(i)});
-    ASSERT_TRUE(docs.has_value());
-    ASSERT_EQ(docs.value().size(), 1);
-    ASSERT_EQ(docs.value().count(TestHelper::MakePK(i)), 1);
-    ASSERT_EQ(new_doc, *docs.value()[TestHelper::MakePK(i)]);
+    for (int i = 0; i < 100; i++) {
+      // std::cout << "insert: " << i << std::endl;
 
-    auto stats = collection->Stats().value();
-    ASSERT_EQ(stats.doc_count, i + 1);
-
-    // upsert
-    new_doc = TestHelper::CreateDoc(i + 1, *schema, TestHelper::MakePK(i));
-    new_docs = {new_doc};
-    res = collection->Upsert(new_docs);
-    ASSERT_TRUE(res.has_value());
-    ASSERT_TRUE(res.value()[0].ok());
-
-    // fetch
-    docs = collection->Fetch({TestHelper::MakePK(i)}).value();
-    ASSERT_TRUE(docs.has_value());
-    ASSERT_EQ(docs.value().size(), 1);
-    ASSERT_EQ(docs.value().count(TestHelper::MakePK(i)), 1);
-    ASSERT_EQ(new_doc, *docs.value()[TestHelper::MakePK(i)]);
+      // insert
+      auto new_doc = TestHelper::CreateDoc(i, *schema);
+      std::vector<Doc> new_docs = {new_doc};
+      auto res = collection->Insert(new_docs);
+      ASSERT_TRUE(res.has_value());
+      ASSERT_TRUE(res.value()[0].ok());
 
-    stats = collection->Stats().value();
-    ASSERT_EQ(stats.doc_count, i + 1);
-
-    // update
-    new_doc = TestHelper::CreateDoc(i + 2, *schema, TestHelper::MakePK(i));
-    new_docs = {new_doc};
-    res = collection->Update(new_docs);
-    ASSERT_TRUE(res.has_value());
-    ASSERT_TRUE(res.value()[0].ok());
-
-    // fetch
-    docs = collection->Fetch({TestHelper::MakePK(i)}).value();
-    ASSERT_TRUE(docs.has_value());
-    ASSERT_EQ(docs.value().size(), 1);
-    ASSERT_EQ(docs.value().count(TestHelper::MakePK(i)), 1);
-    ASSERT_EQ(new_doc, *docs.value()[TestHelper::MakePK(i)]);
+      // fetch
+      auto docs = collection->Fetch({TestHelper::MakePK(i)});
+      ASSERT_TRUE(docs.has_value());
+      ASSERT_EQ(docs.value().size(), 1);
+      ASSERT_EQ(docs.value().count(TestHelper::MakePK(i)), 1);
+      ASSERT_EQ(new_doc, *docs.value()[TestHelper::MakePK(i)]);
 
-    stats = collection->Stats().value();
-    ASSERT_EQ(stats.doc_count, i + 1);
+      auto stats = collection->Stats().value();
+      ASSERT_EQ(stats.doc_count, i + 1);
 
-    // delete
-    res = collection->Delete({TestHelper::MakePK(i)});
-    ASSERT_TRUE(res.has_value());
-    ASSERT_TRUE(res.value()[0].ok());
+      // upsert
+      new_doc = TestHelper::CreateDoc(i + 1, *schema, TestHelper::MakePK(i));
+      new_docs = {new_doc};
+      res = collection->Upsert(new_docs);
+      ASSERT_TRUE(res.has_value());
+      ASSERT_TRUE(res.value()[0].ok());
 
-    stats = collection->Stats().value();
-    ASSERT_EQ(stats.doc_count, i);
-
-    // insert again
-    new_doc = TestHelper::CreateDoc(i, *schema);
-    new_docs = {new_doc};
-    res = collection->Insert(new_docs);
-    ASSERT_TRUE(res.has_value());
-    ASSERT_TRUE(res.value()[0].ok());
-
-    // fetch
-    docs = collection->Fetch({TestHelper::MakePK(i)});
-    ASSERT_TRUE(docs.has_value());
-    ASSERT_EQ(docs.value().size(), 1);
-    ASSERT_EQ(docs.value().count(TestHelper::MakePK(i)), 1);
-    ASSERT_EQ(new_doc, *docs.value()[TestHelper::MakePK(i)]);
+      // fetch
+      docs = collection->Fetch({TestHelper::MakePK(i)}).value();
+      ASSERT_TRUE(docs.has_value());
+      ASSERT_EQ(docs.value().size(), 1);
+      ASSERT_EQ(docs.value().count(TestHelper::MakePK(i)), 1);
+      ASSERT_EQ(new_doc, *docs.value()[TestHelper::MakePK(i)]);
 
-    stats = collection->Stats().value();
-    ASSERT_EQ(stats.doc_count, i + 1);
-  }
+      stats = collection->Stats().value();
+      ASSERT_EQ(stats.doc_count, i + 1);
+
+      // update
+      new_doc = TestHelper::CreateDoc(i + 2, *schema, TestHelper::MakePK(i));
+      new_docs = {new_doc};
+      res = collection->Update(new_docs);
+      ASSERT_TRUE(res.has_value());
+      ASSERT_TRUE(res.value()[0].ok());
+
+      // fetch
+      docs = collection->Fetch({TestHelper::MakePK(i)}).value();
+      ASSERT_TRUE(docs.has_value());
+      ASSERT_EQ(docs.value().size(), 1);
+      ASSERT_EQ(docs.value().count(TestHelper::MakePK(i)), 1);
+      ASSERT_EQ(new_doc, *docs.value()[TestHelper::MakePK(i)]);
+
+      stats = collection->Stats().value();
+      ASSERT_EQ(stats.doc_count, i + 1);
+
+      // delete
+      res = collection->Delete({TestHelper::MakePK(i)});
+      ASSERT_TRUE(res.has_value());
+      ASSERT_TRUE(res.value()[0].ok());
+
+      stats = collection->Stats().value();
+      ASSERT_EQ(stats.doc_count, i);
+
+      // insert again
+      new_doc = TestHelper::CreateDoc(i, *schema);
+      new_docs = {new_doc};
+      res = collection->Insert(new_docs);
+      ASSERT_TRUE(res.has_value());
+      ASSERT_TRUE(res.value()[0].ok());
+
+      // fetch
+      docs = collection->Fetch({TestHelper::MakePK(i)});
+      ASSERT_TRUE(docs.has_value());
+      ASSERT_EQ(docs.value().size(), 1);
+      ASSERT_EQ(docs.value().count(TestHelper::MakePK(i)), 1);
+      ASSERT_EQ(new_doc, *docs.value()[TestHelper::MakePK(i)]);
+
+      stats = collection->Stats().value();
+      ASSERT_EQ(stats.doc_count, i + 1);
+    }
+  };
+  func(true);
+  func(false);
 }
 
 TEST_F(CollectionTest, Feature_CreateIndex_General) {
-  // create empty collection
-  auto schema = TestHelper::CreateNormalSchema();
-  auto options = CollectionOptions{false, true, 64 * 1024 * 1024};
-  auto collection = TestHelper::CreateCollectionWithDoc(col_path, *schema,
-                                                        options, 0, 0, false);
+  auto func = [&](bool enable_mmap) {
+    FileHelper::RemoveDirectory(col_path);
+    // create empty collection (fresh directory for each mmap mode)
+    auto schema = TestHelper::CreateNormalSchema();
+    auto options = CollectionOptions{false, enable_mmap, 64 * 1024 * 1024};
+    auto collection = TestHelper::CreateCollectionWithDoc(col_path, *schema,
+                                                          options, 0, 0, false);
 
-  ASSERT_TRUE(collection->Flush().ok());
-  auto stats = collection->Stats().value();
-  ASSERT_EQ(stats.doc_count, 0);
+    ASSERT_TRUE(collection->Flush().ok());
+    auto stats = collection->Stats().value();
+    ASSERT_EQ(stats.doc_count, 0);
 
-  auto index_params = std::make_shared<HnswIndexParams>(MetricType::IP);
-  auto s = collection->CreateIndex("dense_fp32", index_params);
-  if (!s.ok()) {
-    std::cout << "status: " << s.message() << std::endl;
-    ASSERT_TRUE(false);
-  }
-  auto new_index_params = std::make_shared<HnswIndexParams>(MetricType::COSINE);
-  s = collection->CreateIndex("dense_fp32", index_params);
-  if (!s.ok()) {
-    std::cout << "status: " << s.message() << std::endl;
-    ASSERT_TRUE(false);
-  }
+    auto index_params = std::make_shared<HnswIndexParams>(MetricType::IP);
+    auto s = collection->CreateIndex("dense_fp32", index_params);
+    if (!s.ok()) {
+      std::cout << "status: " << s.message() << std::endl;
+      ASSERT_TRUE(false);
+    }
+    auto new_index_params =
+        std::make_shared<HnswIndexParams>(MetricType::COSINE);
+    s = collection->CreateIndex("dense_fp32", new_index_params);
+    if (!s.ok()) {
+      std::cout << "status: " << s.message() << std::endl;
+      ASSERT_TRUE(false);
+    }
 
-  s = collection->CreateIndex("dense_fp32_invalid", index_params);
-  ASSERT_FALSE(s.ok());
+    s = collection->CreateIndex("dense_fp32_invalid", index_params);
+    ASSERT_FALSE(s.ok());
+  };
+  func(true);
+  func(false);
 }
 
 TEST_F(CollectionTest, Feature_CreateIndex_Vector) {
@@ -2229,72 +2255,77 @@ TEST_F(CollectionTest, Feature_CreateIndex_Scalar) {
 }
 
 TEST_F(CollectionTest, Feature_DropIndex_General) {
-  // create empty collection
-  auto schema = TestHelper::CreateSchemaWithVectorIndex();
-  auto options = CollectionOptions{false, true, 64 * 1024 * 1204};
-  auto collection = TestHelper::CreateCollectionWithDoc(col_path, *schema,
-                                                        options, 0, 0, false);
+  auto func = [&](bool enable_mmap) {
+    FileHelper::RemoveDirectory(col_path);
+    // create empty collection
+    auto schema = TestHelper::CreateSchemaWithVectorIndex();
+    auto options = CollectionOptions{false, enable_mmap, 64 * 1024 * 1024};
+    auto collection = TestHelper::CreateCollectionWithDoc(col_path, *schema,
+                                                          options, 0, 0, false);
 
-  ASSERT_TRUE(collection->Flush().ok());
-  auto stats = collection->Stats().value();
-  ASSERT_EQ(stats.doc_count, 0);
-  ASSERT_EQ(stats.index_completeness["dense_fp32"], 1);
+    ASSERT_TRUE(collection->Flush().ok());
+    auto stats = collection->Stats().value();
+    ASSERT_EQ(stats.doc_count, 0);
+    ASSERT_EQ(stats.index_completeness["dense_fp32"], 1);
 
-  ASSERT_EQ(collection->Schema(), *schema);
+    ASSERT_EQ(collection->Schema(), *schema);
 
 
-  auto s = collection->DropIndex("dense_fp32_invalid");
-  ASSERT_FALSE(s.ok());
+    auto s = collection->DropIndex("dense_fp32_invalid");
+    ASSERT_FALSE(s.ok());
 
-  s = collection->DropIndex("dense_fp32");
-  if (!s.ok()) {
-    std::cout << "drop index err: " << s.message() << std::endl;
-  }
-  ASSERT_TRUE(s.ok());
+    s = collection->DropIndex("dense_fp32");
+    if (!s.ok()) {
+      std::cout << "drop index err: " << s.message() << std::endl;
+    }
+    ASSERT_TRUE(s.ok());
 
-  s = collection->DropIndex("dense_fp32");
-  ASSERT_TRUE(s.ok());
+    s = collection->DropIndex("dense_fp32");
+    ASSERT_TRUE(s.ok());
 
-  auto new_schema = std::make_shared<CollectionSchema>(*schema);
-  s = new_schema->drop_index("dense_fp32");
-  ASSERT_TRUE(s.ok());
-  ASSERT_EQ(*new_schema, collection->Schema());
+    auto new_schema = std::make_shared<CollectionSchema>(*schema);
+    s = new_schema->drop_index("dense_fp32");
+    ASSERT_TRUE(s.ok());
+    ASSERT_EQ(*new_schema, collection->Schema());
 
-  stats = collection->Stats().value();
-  ASSERT_EQ(stats.doc_count, 0);
-  ASSERT_EQ(stats.index_completeness["dense_fp32"], 1);
+    stats = collection->Stats().value();
+    ASSERT_EQ(stats.doc_count, 0);
+    ASSERT_EQ(stats.index_completeness["dense_fp32"], 1);
 
-  ASSERT_EQ(*collection->Schema()
-                 .value()
-                 .get_vector_field("dense_fp32")
-                 ->index_params(),
-            DefaultVectorIndexParams);
+    ASSERT_EQ(*collection->Schema()
+                   .value()
+                   .get_vector_field("dense_fp32")
+                   ->index_params(),
+              DefaultVectorIndexParams);
 
-  s = collection->DropIndex("dense_fp32");
-  if (!s.ok()) {
-    std::cout << "drop index err: " << s.message() << std::endl;
-  }
-  ASSERT_TRUE(s.ok());
+    s = collection->DropIndex("dense_fp32");
+    if (!s.ok()) {
+      std::cout << "drop index err: " << s.message() << std::endl;
+    }
+    ASSERT_TRUE(s.ok());
 
-  auto schema1 = collection->Schema().value();
+    auto schema1 = collection->Schema().value();
 
-  collection.reset();
+    collection.reset();
 
-  auto result = Collection::Open(col_path, options);
-  ASSERT_TRUE(result.has_value());
+    auto result = Collection::Open(col_path, options);
+    ASSERT_TRUE(result.has_value());
 
-  collection = std::move(result.value());
-  auto schema2 = collection->Schema().value();
+    collection = std::move(result.value());
+    auto schema2 = collection->Schema().value();
 
-  if (schema1 != schema2) {
-    std::cout << "schema1: " << schema1.to_string_formatted() << std::endl;
-    std::cout << "schema2: " << schema2.to_string_formatted() << std::endl;
-  }
-  ASSERT_EQ(schema1, schema2);
+    if (schema1 != schema2) {
+      std::cout << "schema1: " << schema1.to_string_formatted() << std::endl;
+      std::cout << "schema2: " << schema2.to_string_formatted() << std::endl;
+    }
+    ASSERT_EQ(schema1, schema2);
 
-  stats = collection->Stats().value();
-  ASSERT_EQ(stats.doc_count, 0);
-  ASSERT_EQ(stats.index_completeness["dense_fp32"], 1);
+    stats = collection->Stats().value();
+    ASSERT_EQ(stats.doc_count, 0);
+    ASSERT_EQ(stats.index_completeness["dense_fp32"], 1);
+  };
+  func(true);
+  func(false);
 }
 
 TEST_F(CollectionTest, Feature_DropIndex_Vector) {
@@ -2526,14 +2557,14 @@ TEST_F(CollectionTest, Feature_DropIndex_AfterCreate) {
 }
 
 TEST_F(CollectionTest, Feature_Optimize_General) {
-  auto func = [](int concurrency) {
+  auto func = [](bool enable_mmap, int concurrency) {
     FileHelper::RemoveDirectory(col_path);
 
     int doc_count = 1000;
 
     // create empty collection
     auto schema = TestHelper::CreateSchemaWithVectorIndex();
-    auto options = CollectionOptions{false, true, 64 * 1024 * 1024};
+    auto options = CollectionOptions{false, enable_mmap, 64 * 1024 * 1024};
     auto collection = TestHelper::CreateCollectionWithDoc(
         col_path, *schema, options, 0, doc_count, false);
 
@@ -2585,12 +2616,15 @@ TEST_F(CollectionTest, Feature_Optimize_General) {
     std::cout << "check success 3" << std::endl;
   };
 
-  func(0);
-  func(4);
+  for (bool enable_mmap : {true, false}) {
+    func(enable_mmap, 0);
+    func(enable_mmap, 4);
+  }
 }
 
 TEST_F(CollectionTest, Feature_Optimize_Repeated) {
-  auto run_repeated_optimize_test = [&](IndexParams::Ptr index_params) {
+  auto run_repeated_optimize_test = [&](bool enable_mmap,
+                                        IndexParams::Ptr index_params) {
     ASSERT_NE(index_params, nullptr);
     SCOPED_TRACE(testing::Message()
                  << "index_params=" << index_params->to_string());
@@ -2599,7 +2633,7 @@ TEST_F(CollectionTest, Feature_Optimize_Repeated) {
     int doc_count = 1000;
     auto schema =
         TestHelper::CreateSchemaWithVectorIndex(false, "demo", index_params);
-    auto options = CollectionOptions{false, true, 64 * 1024 * 1024};
+    auto options = CollectionOptions{false, enable_mmap, 64 * 1024 * 1024};
     auto collection = TestHelper::CreateCollectionWithDoc(
         col_path, *schema, options, 0, doc_count, false);
 
@@ -2740,22 +2774,31 @@ TEST_F(CollectionTest, Feature_Optimize_Repeated) {
   };
 
 
-  run_repeated_optimize_test(std::make_shared<FlatIndexParams>(
-      MetricType::IP, QuantizeType::UNDEFINED));
-  run_repeated_optimize_test(
-      std::make_shared<FlatIndexParams>(MetricType::IP, QuantizeType::FP16));
-  run_repeated_optimize_test(std::make_shared<HnswIndexParams>(
-      MetricType::IP, 16, 200, QuantizeType::UNDEFINED));
-  run_repeated_optimize_test(std::make_shared<HnswIndexParams>(
-      MetricType::IP, 16, 200, QuantizeType::FP16));
-  run_repeated_optimize_test(std::make_shared<IVFIndexParams>(
-      MetricType::IP, 10, 4, false, QuantizeType::UNDEFINED));
-  run_repeated_optimize_test(std::make_shared<IVFIndexParams>(
-      MetricType::IP, 10, 4, false, QuantizeType::FP16));
+  for (bool enable_mmap : {true, false}) {
+    run_repeated_optimize_test(enable_mmap,
+                               std::make_shared<FlatIndexParams>(
+                                   MetricType::IP, QuantizeType::UNDEFINED));
+    run_repeated_optimize_test(
+        enable_mmap,
+        std::make_shared<FlatIndexParams>(MetricType::IP, QuantizeType::FP16));
+    run_repeated_optimize_test(
+        enable_mmap, std::make_shared<HnswIndexParams>(
+                         MetricType::IP, 16, 200, QuantizeType::UNDEFINED));
+    run_repeated_optimize_test(
+        enable_mmap, std::make_shared<HnswIndexParams>(MetricType::IP, 16, 200,
+                                                       QuantizeType::FP16));
+    run_repeated_optimize_test(enable_mmap, std::make_shared<IVFIndexParams>(
+                                                MetricType::IP, 10, 4, false,
+                                                QuantizeType::UNDEFINED));
+    run_repeated_optimize_test(
+        enable_mmap, std::make_shared<IVFIndexParams>(
+                         MetricType::IP, 10, 4, false, QuantizeType::FP16));
 #if RABITQ_SUPPORTED
-  run_repeated_optimize_test(std::make_shared<HnswRabitqIndexParams>(
-      MetricType::IP, 7, 256, 16, 200, 0));
+    run_repeated_optimize_test(
+        enable_mmap, std::make_shared<HnswRabitqIndexParams>(MetricType::IP, 7,
+                                                             256, 16, 200, 0));
 #endif
+  }
 }
 
 TEST_F(CollectionTest, Feature_Optimize_MetricType) {
@@ -3428,13 +3471,13 @@ TEST_F(CollectionTest, Feature_Query_Validate) {
 }
 
 TEST_F(CollectionTest, Feature_Query_General) {
-  auto func = [&](std::string field_name) {
+  auto func = [&](bool enable_mmap, std::string field_name) {
     FileHelper::RemoveDirectory(col_path);
 
     int doc_count = 1000;
     // create with normal schema
     auto schema = TestHelper::CreateNormalSchema();
-    auto options = CollectionOptions{false, true, 100 * 1024 * 1024};
+    auto options = CollectionOptions{false, enable_mmap, 100 * 1024 * 1024};
     auto collection = TestHelper::CreateCollectionWithDoc(
         col_path, *schema, options, 0, doc_count);
 
@@ -3496,8 +3539,10 @@ TEST_F(CollectionTest, Feature_Query_General) {
     }
   };
 
-  func("dense_fp32");
-  func("sparse_fp32");
+  for (bool enable_mmap : {true, false}) {
+    func(enable_mmap, "dense_fp32");
+    func(enable_mmap, "sparse_fp32");
+  }
 }
 
 TEST_F(CollectionTest, Feature_Query_Empty) {
@@ -4114,69 +4159,96 @@ TEST_F(CollectionTest, Feature_MultiQuery_CallbackReranker) {
 TEST_F(CollectionTest, Feature_GroupByQuery) {}
 
 TEST_F(CollectionTest, Feature_AddColumn_General) {
-  // create collection
-  int doc_count = 1000;
-  auto schema = TestHelper::CreateNormalSchema();
-  auto options = CollectionOptions{false, true, 64 * 1024 * 1024};
-  auto collection = TestHelper::CreateCollectionWithDoc(
-      col_path, *schema, options, 0, doc_count, false);
+  auto func = [&](bool enable_mmap) {
+    FileHelper::RemoveDirectory(col_path);
+    // create collection
+    int doc_count = 1000;
+    auto schema = TestHelper::CreateNormalSchema();
+    auto options = CollectionOptions{false, enable_mmap, 64 * 1024 * 1024};
+    auto collection = TestHelper::CreateCollectionWithDoc(
+        col_path, *schema, options, 0, doc_count, false);
 
-  ASSERT_TRUE(collection->Flush().ok());
-  auto stats = collection->Stats().value();
-  ASSERT_EQ(stats.doc_count, doc_count);
-  auto field_schema =
-      std::make_shared<FieldSchema>("add_int32", DataType::INT32, false);
-  auto s = collection->AddColumn(field_schema, "int32", AddColumnOptions());
-  if (!s.ok()) {
-    std::cout << "status: " << s.message() << std::endl;
-    ASSERT_TRUE(false);
-  }
-  auto new_schema = collection->Schema().value();
-  ASSERT_TRUE(new_schema.has_field("add_int32"));
+    ASSERT_TRUE(collection->Flush().ok());
+    auto stats = collection->Stats().value();
+    ASSERT_EQ(stats.doc_count, doc_count);
+    auto field_schema =
+        std::make_shared<FieldSchema>("add_int32", DataType::INT32, false);
+    auto s = collection->AddColumn(field_schema, "int32", AddColumnOptions());
+    if (!s.ok()) {
+      std::cout << "status: " << s.message() << std::endl;
+      ASSERT_TRUE(false);
+    }
+    auto new_schema = collection->Schema().value();
+    ASSERT_TRUE(new_schema.has_field("add_int32"));
 
-  stats = collection->Stats().value();
-  ASSERT_EQ(stats.doc_count, doc_count);
+    stats = collection->Stats().value();
+    ASSERT_EQ(stats.doc_count, doc_count);
 
-  auto check_doc = [&](int doc_count) {
-    for (int i = 0; i < doc_count; i++) {
-      auto expect_doc = TestHelper::CreateDoc(i, new_schema);
-      auto result = collection->Fetch({expect_doc.pk()});
-      ASSERT_TRUE(result.has_value());
-      ASSERT_EQ(result.value().size(), 1);
-      ASSERT_EQ(result.value().count(expect_doc.pk()), 1);
-      auto doc = result.value()[expect_doc.pk()];
-      ASSERT_NE(doc, nullptr);
-      if (*doc != expect_doc) {
-        std::cout << "       doc:" << doc->to_detail_string() << std::endl;
-        std::cout << "expect_doc:" << expect_doc.to_detail_string()
-                  << std::endl;
+    auto check_doc = [&](int doc_count) {
+      for (int i = 0; i < doc_count; i++) {
+        auto expect_doc = TestHelper::CreateDoc(i, new_schema);
+        auto result = collection->Fetch({expect_doc.pk()});
+        ASSERT_TRUE(result.has_value());
+        ASSERT_EQ(result.value().size(), 1);
+        ASSERT_EQ(result.value().count(expect_doc.pk()), 1);
+        auto doc = result.value()[expect_doc.pk()];
+        ASSERT_NE(doc, nullptr);
+        if (*doc != expect_doc) {
+          std::cout << "       doc:" << doc->to_detail_string() << std::endl;
+          std::cout << "expect_doc:" << expect_doc.to_detail_string()
+                    << std::endl;
+        }
+        ASSERT_EQ(*doc, expect_doc);
       }
-      ASSERT_EQ(*doc, expect_doc);
-    }
-  };
+    };
 
-  check_doc(doc_count);
+    check_doc(doc_count);
 
-  // validate query result
-  for (int i = 1; i < 2; i++) {
-    SearchQuery query;
-    query.topk_ = 10;
-    query.include_vector_ = true;
+    // validate query result
+    for (int i = 1; i < 2; i++) {
+      SearchQuery query;
+      query.topk_ = 10;
+      query.include_vector_ = true;
 
-    auto result = collection->Query(query);
-    if (!result.has_value()) {
-      std::cout << "err: " << result.error().message() << std::endl;
+      auto result = collection->Query(query);
+      if (!result.has_value()) {
+        std::cout << "err: " << result.error().message() << std::endl;
+      }
+      ASSERT_TRUE(result.has_value());
+      ASSERT_EQ(result.value().size(), std::min(query.topk_, doc_count));
+
+      auto fields_name = new_schema.all_field_names();
+      for (int j = 0; j < std::min(query.topk_, doc_count); j++) {
+        auto result_doc = result.value()[j];
+        auto doc_fields_names = result_doc->field_names();
+        ASSERT_TRUE(vectors_equal_when_sorted(fields_name, doc_fields_names));
+      }
     }
-    ASSERT_TRUE(result.has_value());
-    ASSERT_EQ(result.value().size(), std::min(query.topk_, doc_count));
+    check_doc(doc_count);
 
-    auto fields_name = new_schema.all_field_names();
-    for (int j = 0; j < std::min(query.topk_, doc_count); j++) {
-      auto result_doc = result.value()[j];
-      auto doc_fields_names = result_doc->field_names();
-      ASSERT_TRUE(vectors_equal_when_sorted(fields_name, doc_fields_names));
+    // validate query result
+    for (int i = 1; i < 2; i++) {
+      SearchQuery query;
+      query.topk_ = 10;
+      query.include_vector_ = true;
+
+      auto result = collection->Query(query);
+      if (!result.has_value()) {
+        std::cout << "err: " << result.error().message() << std::endl;
+      }
+      ASSERT_TRUE(result.has_value());
+      ASSERT_EQ(result.value().size(), std::min(query.topk_, doc_count));
+
+      auto fields_name = new_schema.all_field_names();
+      for (int j = 0; j < std::min(query.topk_, doc_count); j++) {
+        auto result_doc = result.value()[j];
+        auto doc_fields_names = result_doc->field_names();
+        ASSERT_TRUE(vectors_equal_when_sorted(fields_name, doc_fields_names));
+      }
     }
-  }
+  };
+  func(true);
+  func(false);
 }
 
 TEST_F(CollectionTest, Feature_AddColumn_CornerCase) {